diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index f4b97efa..c72baef1 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -35,8 +35,8 @@ jobs:
           cd dev/dev-tools
           mkdir -p vendor
           npm install tailwindcss @tailwindcss/cli
-          wget https://github.com/saadeghi/daisyui/releases/latest/download/daisyui.mjs -O vendor/daisyui.mjs
-          wget https://github.com/saadeghi/daisyui/releases/latest/download/daisyui-theme.mjs -O vendor/daisyui-theme.mjs
+          wget https://github.com/saadeghi/daisyui/releases/download/v5.5.19/daisyui.mjs -O vendor/daisyui.mjs
+          wget https://github.com/saadeghi/daisyui/releases/download/v5.5.19/daisyui-theme.mjs -O vendor/daisyui-theme.mjs
           npx @tailwindcss/cli -i tailwind.css -o assets/tailwind.css
 
       - name: Check formatting
@@ -56,6 +56,22 @@ jobs:
       - name: Check for unused dependencies
         run: cargo shear
 
+  dev_tools:
+    name: Dev Tools
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: ./.github/actions/free-disk-space
+      - uses: cachix/install-nix-action@v31
+        with:
+          github_access_token: ${{ secrets.GITHUB_TOKEN }}
+      - uses: Swatinem/rust-cache@v2
+        with:
+          shared-key: ci-${{ runner.os }}
+          save-if: ${{ github.ref == 'refs/heads/main' }}
+      - name: Build dev tools
+        run: nix develop -c bash -lc 'cd dev/dev-tools && dx build'
+
   unit_test:
     name: Unit Test
     runs-on: ubuntu-latest
@@ -74,8 +90,8 @@ jobs:
           cd dev/dev-tools
           mkdir -p vendor
           npm install tailwindcss @tailwindcss/cli
-          wget https://github.com/saadeghi/daisyui/releases/latest/download/daisyui.mjs -O vendor/daisyui.mjs
-          wget https://github.com/saadeghi/daisyui/releases/latest/download/daisyui-theme.mjs -O vendor/daisyui-theme.mjs
+          wget https://github.com/saadeghi/daisyui/releases/download/v5.5.19/daisyui.mjs -O vendor/daisyui.mjs
+          wget https://github.com/saadeghi/daisyui/releases/download/v5.5.19/daisyui-theme.mjs -O vendor/daisyui-theme.mjs
           npx @tailwindcss/cli -i tailwind.css -o assets/tailwind.css
       - name: Generate code coverage
         run: cargo llvm-cov --workspace --codecov --output-path codecov.json
@@ -150,13 +166,13 @@ jobs:
           cargo llvm-cov clean --workspace
           cargo build --bin bench_server
           cargo build --bin clickbench_client
-          env RUST_LOG=info nohup cargo run --bin bench_server -- --abort-on-panic --cache-mode liquid --max-cache-mb 256 &> server.log &
+          env RUST_LOG=info nohup cargo run --bin bench_server -- --abort-on-panic --cache-mode liquid --max-memory-mb 256 &> server.log &
           sleep 2 # Wait for server to start up
           env RUST_LOG=info cargo run --bin clickbench_client -- --manifest benchmark/clickbench/benchmark_manifest.json
           echo "=== Server logs ==="
           cat server.log || echo "No server log found"
           curl http://localhost:53703/shutdown
-          env RUST_LOG=info cargo run --bin in_process -- --manifest benchmark/clickbench/benchmark_manifest.json --bench-mode liquid --max-cache-mb 256
+          env RUST_LOG=info cargo run --bin in_process -- --manifest benchmark/clickbench/benchmark_manifest.json --bench-mode liquid --max-memory-mb 256
           cargo llvm-cov report --codecov --output-path codecov_clickbench.json
       - name: Upload coverage to Codecov
         uses: codecov/codecov-action@v5
@@ -190,13 +206,13 @@ jobs:
           cargo llvm-cov clean --workspace
           cargo build --bin bench_server
           cargo build --bin tpch_client
-          env RUST_LOG=info nohup cargo run --bin bench_server -- --abort-on-panic --cache-mode liquid --max-cache-mb 256 &> server.log &
+          env RUST_LOG=info nohup cargo run --bin bench_server -- --abort-on-panic --cache-mode liquid --max-memory-mb 256 &> server.log &
           sleep 2 # Wait for server to start up
           env RUST_LOG=info cargo run --bin tpch_client -- --manifest benchmark/tpch/manifest.json --answer-dir benchmark/tpch/answers/sf0.1
           echo "=== Server logs ==="
           cat server.log || echo "No server log found"
           curl http://localhost:53703/shutdown
-          env RUST_LOG=info cargo run --bin in_process -- --manifest benchmark/tpch/manifest.json --bench-mode liquid --max-cache-mb 256
+          env RUST_LOG=info cargo run --bin in_process -- --manifest benchmark/tpch/manifest.json --bench-mode liquid --max-memory-mb 256
           cargo llvm-cov report --codecov --output-path codecov_tpch.json
       - name: Upload coverage to Codecov
         uses: codecov/codecov-action@v5
@@ -205,45 +221,46 @@ jobs:
           files: codecov_tpch.json
           fail_ci_if_error: true
 
-  tpcds:
-    name: TPC-DS
-    runs-on: ubuntu-latest
-    steps:
-      - uses: actions/checkout@v4
-      - uses: ./.github/actions/free-disk-space
-      - uses: dtolnay/rust-toolchain@stable
-      - run: sudo apt-get update && sudo apt-get install -y wget
-      - name: Install cargo-llvm-cov
-        uses: taiki-e/install-action@cargo-llvm-cov
-      - uses: Swatinem/rust-cache@v2
-        with:
-          shared-key: ci-${{ runner.os }}
-          save-if: ${{ github.ref == 'refs/heads/main' }}
-      - name: Setup TPC-DS data
-        run: |
-          curl -LsSf https://astral.sh/uv/install.sh | sh
-          cd benchmark/tpcds
-          uvx --from duckdb python tpcds_gen.py --scale 0.1 --answers-dir answers --data-dir data --queries-dir queries
-      - name: Run TPC-DS
-        run: |
-          source <(cargo llvm-cov show-env --export-prefix)
-          cargo llvm-cov clean --workspace
-          cargo build --bin bench_server
-          cargo build --bin tpcds_client
-          env RUST_LOG=info nohup cargo run --bin bench_server -- --abort-on-panic --cache-mode liquid --max-cache-mb 256 &> server.log &
-          sleep 2 # Wait for server to start up
-          env RUST_LOG=info cargo run --bin tpcds_client -- --manifest benchmark/tpcds/manifest.json --answer-dir benchmark/tpcds/answers/sf0.1
-          echo "=== Server logs ==="
-          cat server.log || echo "No server log found"
-          curl http://localhost:53703/shutdown
-          env RUST_LOG=info cargo run --bin in_process -- --manifest benchmark/tpcds/manifest.json --bench-mode liquid --max-cache-mb 256
-          cargo llvm-cov report --codecov --output-path codecov_tpcds.json
-      - name: Upload coverage to Codecov
-        uses: codecov/codecov-action@v5
-        with:
-          token: ${{ secrets.CODECOV_TOKEN }}
-          files: codecov_tpcds.json
-          fail_ci_if_error: true
+  # Disable for now, because the upstream datafusion has a bug.
+  # tpcds:
+  #   name: TPC-DS
+  #   runs-on: ubuntu-latest
+  #   steps:
+  #     - uses: actions/checkout@v4
+  #     - uses: ./.github/actions/free-disk-space
+  #     - uses: dtolnay/rust-toolchain@stable
+  #     - run: sudo apt-get update && sudo apt-get install -y wget
+  #     - name: Install cargo-llvm-cov
+  #       uses: taiki-e/install-action@cargo-llvm-cov
+  #     - uses: Swatinem/rust-cache@v2
+  #       with:
+  #         shared-key: ci-${{ runner.os }}
+  #         save-if: ${{ github.ref == 'refs/heads/main' }}
+  #     - name: Setup TPC-DS data
+  #       run: |
+  #         curl -LsSf https://astral.sh/uv/install.sh | sh
+  #         cd benchmark/tpcds
+  #         uvx --from duckdb python tpcds_gen.py --scale 0.1 --answers-dir answers --data-dir data --queries-dir queries
+  #     - name: Run TPC-DS
+  #       run: |
+  #         source <(cargo llvm-cov show-env --export-prefix)
+  #         cargo llvm-cov clean --workspace
+  #         cargo build --bin bench_server
+  #         cargo build --bin tpcds_client
+  #         env RUST_LOG=info nohup cargo run --bin bench_server -- --abort-on-panic --cache-mode liquid --max-memory-mb 256 &> server.log &
+  #         sleep 2 # Wait for server to start up
+  #         env RUST_LOG=info cargo run --bin tpcds_client -- --manifest benchmark/tpcds/manifest.json --answer-dir benchmark/tpcds/answers/sf0.1
+  #         echo "=== Server logs ==="
+  #         cat server.log || echo "No server log found"
+  #         curl http://localhost:53703/shutdown
+  #         env RUST_LOG=info cargo run --bin in_process -- --manifest benchmark/tpcds/manifest.json --bench-mode liquid --max-memory-mb 256
+  #         cargo llvm-cov report --codecov --output-path codecov_tpcds.json
+  #     - name: Upload coverage to Codecov
+  #       uses: codecov/codecov-action@v5
+  #       with:
+  #         token: ${{ secrets.CODECOV_TOKEN }}
+  #         files: codecov_tpcds.json
+  #         fail_ci_if_error: true
 
   stackoverflow:
     name: StackOverflow
@@ -291,7 +308,7 @@ jobs:
           source <(cargo llvm-cov show-env --export-prefix)
           cargo llvm-cov clean --workspace
           cargo build --bin in_process
-          env RUST_LOG=info cargo run --bin in_process -- --manifest benchmark/stackoverflow/manifest.dba.json --bench-mode liquid --max-cache-mb 10
+          env RUST_LOG=info cargo run --bin in_process -- --manifest benchmark/stackoverflow/manifest.dba.json --bench-mode liquid --max-memory-mb 10
           cargo llvm-cov report --codecov --output-path codecov_stackoverflow.json
       - name: Upload coverage to Codecov
         uses: codecov/codecov-action@v5
@@ -300,143 +317,143 @@ jobs:
           files: codecov_stackoverflow.json
           fail_ci_if_error: true
 
-  benchmark:
-    name: Performance Benchmark
-    runs-on: pittsburgh
-    permissions:
-      contents: write
-      pull-requests: write
-    steps:
-      - uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-          token: ${{ secrets.GITHUB_TOKEN }}
-      - uses: dtolnay/rust-toolchain@stable
-      - name: Setup ClickBench partitioned data download
-        run: |
-          mkdir -p benchmark/clickbench/data
-          for partition in 0 1 2 3; do
-            echo "Downloading partition ${partition}..."
- wget "https://datasets.clickhouse.com/hits_compatible/athena_partitioned/hits_${partition}.parquet" \ - -O "benchmark/clickbench/data/hits_${partition}.parquet" - done - - - name: Update manifest for partitioned data - run: | - # Update the manifest to point to the partitioned data directory - sed 's|"benchmark/clickbench/data/hits.parquet"|"benchmark/clickbench/data"|' \ - benchmark/clickbench/manifest.json > benchmark/clickbench/benchmark_manifest.json - - - name: Build benchmark binary - run: cargo build --release --bin in_process - - - name: Run LiquidCache benchmark (in-process) - run: | - mkdir -p benchmark_results - env RUST_LOG=info cargo run --release --bin in_process -- \ - --manifest benchmark/clickbench/benchmark_manifest.json \ - --output benchmark_results/liquid.json \ - --iteration 5 \ - --reset-cache \ - --bench-mode liquid \ - --max-cache-mb 64 - - - name: Run DataFusion benchmark (plain parquet) - run: | - env RUST_LOG=info cargo run --release --bin in_process -- \ - --manifest benchmark/clickbench/benchmark_manifest.json \ - --output benchmark_results/parquet.json \ - --iteration 5 \ - --bench-mode parquet - - - name: Run DataFusion benchmark (default config) - run: | - env RUST_LOG=info cargo run --release --bin in_process -- \ - --manifest benchmark/clickbench/benchmark_manifest.json \ - --output benchmark_results/df_default.json \ - --iteration 5 \ - --bench-mode datafusion-default - - - name: Annotate results with commit/timestamp - run: | - jq --arg timestamp "$(date -Iminutes)" --arg commit "${{ github.sha }}" \ - '. + {"timestamp": $timestamp, "commit": $commit}' \ - benchmark_results/liquid.json > benchmark_results/liquid_final.json - jq --arg timestamp "$(date -Iminutes)" --arg commit "${{ github.sha }}" \ - '. + {"timestamp": $timestamp, "commit": $commit}' \ - benchmark_results/parquet.json > benchmark_results/parquet_final.json - jq --arg timestamp "$(date -Iminutes)" --arg commit "${{ github.sha }}" \ - '. 
+ {"timestamp": $timestamp, "commit": $commit}' \ - benchmark_results/df_default.json > benchmark_results/df_default_final.json - - - name: Compare LiquidCache vs DataFusion (same runner) - id: compare - run: | - python3 .github/compare_benchmarks.py \ - benchmark_results/liquid_final.json \ - benchmark_results/df_default_final.json \ - --output comparison.md - echo "COMPARISON_AVAILABLE=true" >> $GITHUB_OUTPUT - - - name: Comment PR with benchmark results - if: steps.compare.outputs.COMPARISON_AVAILABLE == 'true' && github.event_name == 'pull_request' - uses: actions/github-script@v7 - with: - script: | - const fs = require('fs'); - - let comment = ''; - try { - comment = fs.readFileSync('comparison.md', 'utf8'); - } catch (error) { - comment = 'Error reading benchmark comparison results'; - } - - // Check if this is an external PR (from a fork) - const isExternalPR = context.payload.pull_request.head.repo.full_name !== context.payload.pull_request.base.repo.full_name; - - if (isExternalPR) { - console.log('Skipping comment for external PR due to permission restrictions'); - console.log('Benchmark results:'); - console.log(comment); - return; - } - - try { - // Find existing benchmark comment - const comments = await github.rest.issues.listComments({ - owner: context.repo.owner, - repo: context.repo.repo, - issue_number: context.issue.number, - }); - - const botComment = comments.data.find(comment => - comment.user.type === 'Bot' && - comment.body.includes('## 📊 Benchmark Comparison') - ); - - if (botComment) { - // Update existing comment - await github.rest.issues.updateComment({ - owner: context.repo.owner, - repo: context.repo.repo, - comment_id: botComment.id, - body: comment - }); - } else { - // Create new comment - await github.rest.issues.createComment({ - owner: context.repo.owner, - repo: context.repo.repo, - issue_number: context.issue.number, - body: comment - }); - } - } catch (error) { - console.log('Failed to post comment, likely due to permissions:', error.message); - console.log('Benchmark results:'); - console.log(comment); - } + # benchmark: + # name: Performance Benchmark + # runs-on: pittsburgh + # permissions: + # contents: write + # pull-requests: write + # steps: + # - uses: actions/checkout@v4 + # with: + # fetch-depth: 0 + # token: ${{ secrets.GITHUB_TOKEN }} + # - uses: dtolnay/rust-toolchain@stable + # - name: Setup ClickBench partitioned data download + # run: | + # mkdir -p benchmark/clickbench/data + # for partition in 0 1 2 3; do + # echo "Downloading partition ${partition}..." 
+  #           wget "https://datasets.clickhouse.com/hits_compatible/athena_partitioned/hits_${partition}.parquet" \
+  #             -O "benchmark/clickbench/data/hits_${partition}.parquet"
+  #         done
+
+  #     - name: Update manifest for partitioned data
+  #       run: |
+  #         # Update the manifest to point to the partitioned data directory
+  #         sed 's|"benchmark/clickbench/data/hits.parquet"|"benchmark/clickbench/data"|' \
+  #           benchmark/clickbench/manifest.json > benchmark/clickbench/benchmark_manifest.json
+
+  #     - name: Build benchmark binary
+  #       run: cargo build --release --bin in_process
+
+  #     - name: Run LiquidCache benchmark (in-process)
+  #       run: |
+  #         mkdir -p benchmark_results
+  #         env RUST_LOG=info cargo run --release --bin in_process -- \
+  #           --manifest benchmark/clickbench/benchmark_manifest.json \
+  #           --output benchmark_results/liquid.json \
+  #           --iteration 5 \
+  #           --reset-cache \
+  #           --bench-mode liquid \
+  #           --max-memory-mb 64
+
+  #     - name: Run DataFusion benchmark (plain parquet)
+  #       run: |
+  #         env RUST_LOG=info cargo run --release --bin in_process -- \
+  #           --manifest benchmark/clickbench/benchmark_manifest.json \
+  #           --output benchmark_results/parquet.json \
+  #           --iteration 5 \
+  #           --bench-mode parquet
+
+  #     - name: Run DataFusion benchmark (default config)
+  #       run: |
+  #         env RUST_LOG=info cargo run --release --bin in_process -- \
+  #           --manifest benchmark/clickbench/benchmark_manifest.json \
+  #           --output benchmark_results/df_default.json \
+  #           --iteration 5 \
+  #           --bench-mode datafusion-default
+
+  #     - name: Annotate results with commit/timestamp
+  #       run: |
+  #         jq --arg timestamp "$(date -Iminutes)" --arg commit "${{ github.sha }}" \
+  #           '. + {"timestamp": $timestamp, "commit": $commit}' \
+  #           benchmark_results/liquid.json > benchmark_results/liquid_final.json
+  #         jq --arg timestamp "$(date -Iminutes)" --arg commit "${{ github.sha }}" \
+  #           '. + {"timestamp": $timestamp, "commit": $commit}' \
+  #           benchmark_results/parquet.json > benchmark_results/parquet_final.json
+  #         jq --arg timestamp "$(date -Iminutes)" --arg commit "${{ github.sha }}" \
+  #           '. + {"timestamp": $timestamp, "commit": $commit}' \
+  #           benchmark_results/df_default.json > benchmark_results/df_default_final.json
+
+  #     - name: Compare LiquidCache vs DataFusion (same runner)
+  #       id: compare
+  #       run: |
+  #         python3 .github/compare_benchmarks.py \
+  #           benchmark_results/liquid_final.json \
+  #           benchmark_results/df_default_final.json \
+  #           --output comparison.md
+  #         echo "COMPARISON_AVAILABLE=true" >> $GITHUB_OUTPUT
+
+  #     - name: Comment PR with benchmark results
+  #       if: steps.compare.outputs.COMPARISON_AVAILABLE == 'true' && github.event_name == 'pull_request'
+  #       uses: actions/github-script@v7
+  #       with:
+  #         script: |
+  #           const fs = require('fs');
+
+  #           let comment = '';
+  #           try {
+  #             comment = fs.readFileSync('comparison.md', 'utf8');
+  #           } catch (error) {
+  #             comment = 'Error reading benchmark comparison results';
+  #           }
+
+  #           // Check if this is an external PR (from a fork)
+  #           const isExternalPR = context.payload.pull_request.head.repo.full_name !== context.payload.pull_request.base.repo.full_name;
+
+  #           if (isExternalPR) {
+  #             console.log('Skipping comment for external PR due to permission restrictions');
+  #             console.log('Benchmark results:');
+  #             console.log(comment);
+  #             return;
+  #           }
+
+  #           try {
+  #             // Find existing benchmark comment
+  #             const comments = await github.rest.issues.listComments({
+  #               owner: context.repo.owner,
+  #               repo: context.repo.repo,
+  #               issue_number: context.issue.number,
+  #             });
+
+  #             const botComment = comments.data.find(comment =>
+  #               comment.user.type === 'Bot' &&
+  #               comment.body.includes('## 📊 Benchmark Comparison')
+  #             );
+
+  #             if (botComment) {
+  #               // Update existing comment
+  #               await github.rest.issues.updateComment({
+  #                 owner: context.repo.owner,
+  #                 repo: context.repo.repo,
+  #                 comment_id: botComment.id,
+  #                 body: comment
+  #               });
+  #             } else {
+  #               // Create new comment
+  #               await github.rest.issues.createComment({
+  #                 owner: context.repo.owner,
+  #                 repo: context.repo.repo,
+  #                 issue_number: context.issue.number,
+  #                 body: comment
+  #               });
+  #             }
+  #           } catch (error) {
+  #             console.log('Failed to post comment, likely due to permissions:', error.message);
+  #             console.log('Benchmark results:');
+  #             console.log(comment);
+  #           }
 
   examples:
     name: Run client/server/inprocess examples
@@ -504,15 +521,3 @@ jobs:
         run: |
           # Run to populate cache and read arrow array
          env RUST_LOG=info cargo run --bin example_inprocess_read
-
-  kani:
-    name: Run Kani proofs
-    runs-on: ubuntu-22.04
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-      - uses: ./.github/actions/free-disk-space
-      - name: Verify storage crate with Kani
-        uses: model-checking/kani-github-action@v1.1
-        with:
-          working-directory: src/core
diff --git a/AGENTS.md b/AGENTS.md
index 3bd9b5e9..2e8d8c22 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -13,15 +13,6 @@
 - `src/datafusion-client` and `src/datafusion-server`, Client/Server library, this enables distributed LiquidCache.
 - `src/datafusion-local`, this is a in-process LiquidCache, used for local DataFusion instances.
 
-## Study guide
-
-This repo has many studies, they are not for production use, but only for research purposes to understand how the system behaves.
-The coding guidelines for study are different:
-
-1. Focus on concise, minimal intrusive, easy to understand code.
-2. No error handling, no robust edge case handling, just one shot code.
-3. Ok to hard code if it can simplify the implementation.
-
 ### Lineage-based cache expression
 
 1. The lineage_opt.rs analyze the input query's column usage, and passes it down to LiquidCache.
diff --git a/Cargo.lock b/Cargo.lock
index 164d2b88..61dbc368 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -96,9 +96,9 @@ dependencies = [
 
 [[package]]
 name = "anstyle"
-version = "1.0.13"
+version = "1.0.14"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5192cca8006f1fd4f7237516f40fa183bb07f8fbdfedaa0036de5ea9b0b45e78"
+checksum = "940b3a0ca603d1eade50a4846a2afffd5ef57a9feac2c0e2ec2e14f9ead76000"
 
 [[package]]
 name = "anstyle-parse"
@@ -338,7 +338,7 @@ dependencies = [
  "arrow-schema",
  "chrono",
  "half",
- "indexmap 2.13.0",
+ "indexmap 2.14.0",
  "itoa",
  "lexical-core",
  "memchr",
@@ -431,9 +431,9 @@ checksum = "bfdc70193dadb9d7287fa4b633f15f90c876915b31f6af17da307fc59c9859a8"
 
 [[package]]
 name = "async-compression"
-version = "0.4.41"
+version = "0.4.42"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d0f9ee0f6e02ffd7ad5816e9464499fba7b3effd01123b515c41d1697c43dad1"
+checksum = "e79b3f8a79cccc2898f31920fc69f304859b3bd567490f75ebf51ae1c792a9ac"
 dependencies = [
  "compression-codecs",
  "compression-core",
@@ -513,9 +513,9 @@ checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8"
 
 [[package]]
 name = "axum"
-version = "0.8.8"
+version = "0.8.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8b52af3cb4058c895d37317bb27508dccc8e5f2d39454016b297bf4a400597b8"
+checksum = "31b698c5f9a010f6573133b09e0de5408834d0c82f8d7475a89fc1867a71cd90"
 dependencies = [
  "axum-core",
  "axum-macros",
@@ -542,7 +542,7 @@ dependencies = [
  "sha1",
  "sync_wrapper",
  "tokio",
- "tokio-tungstenite",
+ "tokio-tungstenite 0.29.0",
  "tower",
  "tower-layer",
  "tower-service",
@@ -593,9 +593,9 @@ dependencies = [
 
 [[package]]
 name = "axum-macros"
-version = "0.5.0"
+version = "0.5.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "604fde5e028fea851ce1d8570bbdc034bec850d157f7569d10f347d06808c05c"
+checksum = "7aa268c23bfbbd2c4363b9cd302a4f504fb2a9dfe7e3451d66f35dd392e20aca"
 dependencies = [
  "proc-macro2",
  "quote",
@@ -656,9 +656,9 @@ checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
 
 [[package]]
 name = "bitflags"
-version = "2.11.0"
+version = "2.11.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "843867be96c8daad0d758b57df9392b6d8d271134fce549de6ce169ff98a92af"
+checksum = "c4512299f36f043ab09a583e57bceb5a5aab7a73db1805848e8fef3c9e8c78b3"
 dependencies = [
  "serde_core",
 ]
@@ -686,16 +686,16 @@ dependencies = [
 
 [[package]]
 name = "blake3"
-version = "1.8.3"
+version = "1.8.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2468ef7d57b3fb7e16b576e8377cdbde2320c60e1491e961d11da40fc4f02a2d"
+checksum = "0aa83c34e62843d924f905e0f5c866eb1dd6545fc4d719e803d9ba6030371fce"
 dependencies = [
  "arrayref",
  "arrayvec",
  "cc",
  "cfg-if",
  "constant_time_eq",
- "cpufeatures 0.2.17",
+ "cpufeatures 0.3.0",
 ]
 
 [[package]]
@@ -707,15 +707,6 @@ dependencies = [
  "generic-array",
 ]
 
-[[package]]
-name = "block2"
-version = "0.6.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "cdeb9d870516001442e364c5220d3574d2da8dc765554b4a617230d33fa58ef5"
-dependencies = [
- "objc2",
-]
-
 [[package]]
 name = "brotli"
 version = "8.0.2"
@@ -781,9 +772,9 @@ checksum = "cd17eb909a8c6a894926bfcc3400a4bb0e732f5a57d37b1f14e8b29e329bace8"
 
 [[package]]
 name = "cc"
-version = "1.2.57"
+version = "1.2.61"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7a0dd1ca384932ff3641c8718a02769f1698e7563dc6974ffd03346116310423"
+checksum = "d16d90359e986641506914ba71350897565610e87ce0ad9e6f28569db3dd5c6d"
 dependencies = [
  "find-msvc-tools",
  "jobserver",
@@ -828,7 +819,7 @@ checksum = "6f8d983286843e49675a4b7a2d174efe136dc93a18d69130dd18198a6c167601"
 dependencies = [
  "cfg-if",
  "cpufeatures 0.3.0",
- "rand_core 0.10.0",
+ "rand_core 0.10.1",
 ]
 
 [[package]]
@@ -894,9 +885,9 @@ dependencies = [
 
 [[package]]
 name = "clap"
-version = "4.6.0"
+version = "4.6.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b193af5b67834b676abd72466a96c1024e6a6ad978a1f484bd90b85c94041351"
+checksum = "1ddb117e43bbf7dacf0a4190fef4d345b9bad68dfc649cb349e7d17d28428e51"
 dependencies = [
  "clap_builder",
  "clap_derive",
@@ -917,9 +908,9 @@ dependencies = [
 
 [[package]]
 name = "clap_derive"
-version = "4.6.0"
+version = "4.6.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1110bd8a634a1ab8cb04345d8d878267d57c3cf1b38d91b71af6686408bbca6a"
+checksum = "f2ce8604710f6733aa641a2b3731eaa1e8b3d9973d5e3565da11800813f997a9"
 dependencies = [
  "heck",
  "proc-macro2",
@@ -929,15 +920,15 @@ dependencies = [
 
 [[package]]
 name = "clap_lex"
-version = "1.0.0"
+version = "1.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3a822ea5bc7590f9d40f1ba12c0dc3c2760f3482c6984db1573ad11031420831"
+checksum = "c8d4a3bb8b1e0c1050499d1815f5ab16d04f0959b233085fb31653fbfc9d98f9"
 
 [[package]]
 name = "colorchoice"
-version = "1.0.4"
+version = "1.0.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b05b61dc5112cbb17e4b6cd61790d9845d13888356391624cbe7e41efeac1e75"
+checksum = "1d07550c9036bf2ae0c684c4297d503f838287c83c53686d05370d0e139ae570"
 
 [[package]]
 name = "combine"
@@ -961,9 +952,9 @@ dependencies = [
 
 [[package]]
 name = "compression-codecs"
-version = "0.4.37"
+version = "0.4.38"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "eb7b51a7d9c967fc26773061ba86150f19c50c0d65c887cb1fbe295fd16619b7"
+checksum = "ce2548391e9c1929c21bf6aa2680af86fe4c1b33e6cea9ac1cfeec0bd11218cf"
 dependencies = [
  "bzip2",
  "compression-core",
@@ -976,9 +967,9 @@ dependencies = [
 
 [[package]]
 name = "compression-core"
-version = "0.4.31"
+version = "0.4.32"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "75984efb6ed102a0d42db99afb6c1948f0380d1d91808d5529916e6c08b49d8d"
+checksum = "cc14f565cf027a105f7a44ccf9e5b424348421a1d8952a8fc9d499d313107789"
 
 [[package]]
 name = "condtype"
@@ -1068,6 +1059,12 @@ dependencies = [
  "syn",
 ]
 
+[[package]]
+name = "const-siphasher"
+version = "1.0.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "03efed02df0504d71e44bfd51d3329f401b8303a2fd14254acf05c95a0af0153"
+
 [[package]]
 name = "const-str"
 version = "0.7.1"
@@ -1082,11 +1079,12 @@ checksum = "d9c50fcfdf972929aff202c16b80086aa3cfc6a3a820af714096c58c7c1d0582"
 
 [[package]]
 name = "const_format"
-version = "0.2.35"
+version = "0.2.36"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7faa7469a93a566e9ccc1c73fe783b4a65c274c5ace346038dca9c39fe0030ad"
+checksum = "4481a617ad9a412be3b97c5d403fef8ed023103368908b9c50af598ff467cc1e"
 dependencies = [
  "const_format_proc_macros",
+ "konst",
 ]
 
 [[package]]
@@ -1188,11 +1186,24 @@ version = "0.8.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b"
 
+[[package]]
+name = "corosensei"
+version = "0.3.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2c54787b605c7df106ceccf798df23da4f2e09918defad66705d1cedf3bb914f"
+dependencies = [
+ "autocfg",
+ "cfg-if",
+ "libc",
+ "scopeguard",
+ "windows-sys 0.59.0",
+]
+
 [[package]]
 name = "cpp_demangle"
-version = "0.5.1"
+version = "0.4.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0667304c32ea56cb4cd6d2d7c0cfe9a2f8041229db8c033af7f8d69492429def"
+checksum = "f2bb79cb74d735044c972aae58ed0aaa9a837e85b01106a54c39e42e97f62253"
 dependencies = [
  "cfg-if",
 ]
@@ -1335,15 +1346,15 @@ dependencies = [
 
 [[package]]
 name = "data-encoding"
-version = "2.10.0"
+version = "2.11.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d7a1e2f27636f116493b8b860f5546edb47c8d8f8ea73e1d2a20be88e28d1fea"
+checksum = "a4ae5f15dda3c708c0ade84bfee31ccab44a3da4f88015ed22f63732abe300c8"
 
 [[package]]
 name = "datafusion"
-version = "53.0.0"
+version = "53.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "de9f8117889ba9503440f1dd79ebab32ba52ccf1720bb83cd718a29d4edc0d16"
+checksum = "93db0e623840612f7f2cd757f7e8a8922064192363732c88692e0870016e141b"
 dependencies = [
  "arrow",
  "arrow-schema",
@@ -1384,7 +1395,7 @@ dependencies = [
  "object_store",
  "parking_lot",
  "parquet",
- "rand 0.9.2",
+ "rand 0.9.4",
  "regex",
  "sqlparser",
  "tempfile",
@@ -1396,9 +1407,9 @@ dependencies = [
 
 [[package]]
 name = "datafusion-catalog"
-version = "53.0.0"
+version = "53.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "be893b73a13671f310ffcc8da2c546b81efcc54c22e0382c0a28aa3537017137"
+checksum = "37cefde60b26a7f4ff61e9d2ff2833322f91df2b568d7238afe67bde5bdffb66"
 dependencies = [
  "arrow",
  "async-trait",
@@ -1421,9 +1432,9 @@ dependencies = [
 
 [[package]]
 name = "datafusion-catalog-listing"
-version = "53.0.0"
+version = "53.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "830487b51ed83807d6b32d6325f349c3144ae0c9bf772cf2a712db180c31d5e6"
+checksum = "17e112307715d6a7a331111a4c2330ff54bc237183511c319e3708a4cff431fb"
 dependencies = [
  "arrow",
  "async-trait",
@@ -1444,9 +1455,9 @@ dependencies = [
 
 [[package]]
 name = "datafusion-common"
-version = "53.0.0"
+version = "53.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0d7663f3af955292f8004e74bcaf8f7ea3d66cc38438749615bb84815b61a293"
+checksum = "d72a11ca44a95e1081870d3abb80c717496e8a7acb467a1d3e932bb636af5cc2"
 dependencies = [
  "ahash",
  "arrow",
@@ -1454,7 +1465,7 @@ dependencies = [
  "chrono",
  "half",
  "hashbrown 0.16.1",
- "indexmap 2.13.0",
+ "indexmap 2.14.0",
  "itertools",
  "libc",
  "log",
@@ -1469,9 +1480,9 @@ dependencies = [
 
 [[package]]
 name = "datafusion-common-runtime"
-version = "53.0.0"
+version = "53.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5f590205c7e32fe1fea48dd53ffb406e56ae0e7a062213a3ac848db8771641bd"
+checksum = "89f4afaed29670ec4fd6053643adc749fe3f4bc9d1ce1b8c5679b22c67d12def"
 dependencies = [
  "futures",
  "log",
@@ -1480,9 +1491,9 @@ dependencies = [
 
 [[package]]
 name = "datafusion-datasource"
-version = "53.0.0"
+version = "53.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fde1e030a9dc87b743c806fbd631f5ecfa2ccaa4ffb61fa19144a07fea406b79"
+checksum = "e9fb386e1691355355a96419978a0022b7947b44d4a24a6ea99f00b6b485cbb6"
 dependencies = [
  "arrow",
  "async-compression",
@@ -1506,7 +1517,7 @@ dependencies = [
  "liblzma",
  "log",
  "object_store",
- "rand 0.9.2",
+ "rand 0.9.4",
  "tokio",
  "tokio-util",
  "url",
@@ -1515,9 +1526,9 @@ dependencies = [
 
 [[package]]
 name = "datafusion-datasource-arrow"
-version = "53.0.0"
+version = "53.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "331ebae7055dc108f9b54994b93dff91f3a17445539efe5b74e89264f7b36e15"
+checksum = "ffa6c52cfed0734c5f93754d1c0175f558175248bf686c944fb05c373e5fc096"
 dependencies = [
  "arrow",
  "arrow-ipc",
@@ -1539,9 +1550,9 @@ dependencies = [
 
 [[package]]
 name = "datafusion-datasource-csv"
-version = "53.0.0"
+version = "53.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9e0d475088325e2986876aa27bb30d0574f72a22955a527d202f454681d55c5c"
+checksum = "503f29e0582c1fc189578d665ff57d9300da1f80c282777d7eb67bb79fb8cdca"
 dependencies = [
  "arrow",
  "async-trait",
@@ -1562,9 +1573,9 @@ dependencies = [
 
 [[package]]
 name = "datafusion-datasource-json"
-version = "53.0.0"
+version = "53.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ea1520d81f31770f3ad6ee98b391e75e87a68a5bb90de70064ace5e0a7182fe8"
+checksum = "e33804749abc8d0c8cb7473228483cb8070e524c6f6086ee1b85a64debe2b3d2"
 dependencies = [
  "arrow",
  "async-trait",
@@ -1586,9 +1597,9 @@ dependencies = [
 
 [[package]]
 name = "datafusion-datasource-parquet"
-version = "53.0.0"
+version = "53.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "95be805d0742ab129720f4c51ad9242cd872599cdb076098b03f061fcdc7f946"
+checksum = "32a8e0365e0e08e8ff94d912f0ababcf9065a1a304018ba90b1fc83c855b4997"
 dependencies = [
  "arrow",
  "async-trait",
@@ -1616,15 +1627,15 @@ dependencies = [
 
 [[package]]
 name = "datafusion-doc"
-version = "53.0.0"
+version = "53.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5c93ad9e37730d2c7196e68616f3f2dd3b04c892e03acd3a8eeca6e177f3c06a"
+checksum = "8de6ac0df1662b9148ad3c987978b32cbec7c772f199b1d53520c8fa764a87ee"
 
 [[package]]
 name = "datafusion-execution"
-version = "53.0.0"
+version = "53.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9437d3cd5d363f9319f8122182d4d233427de79c7eb748f23054c9aaa0fdd8df"
+checksum = "c03c7fbdaefcca4ef6ffe425a5fc2325763bfb426599bb0bf4536466efabe709"
 dependencies = [
  "arrow",
  "arrow-buffer",
@@ -1638,16 +1649,16 @@ dependencies = [
  "log",
  "object_store",
  "parking_lot",
- "rand 0.9.2",
+ "rand 0.9.4",
  "tempfile",
  "url",
 ]
 
 [[package]]
 name = "datafusion-expr"
-version = "53.0.0"
+version = "53.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "67164333342b86521d6d93fa54081ee39839894fb10f7a700c099af96d7552cf"
+checksum = "574b9b6977fedbd2a611cbff12e5caf90f31640ad9dc5870f152836d94bad0dd"
 dependencies = [
  "arrow",
  "async-trait",
@@ -1658,7 +1669,7 @@ dependencies = [
  "datafusion-functions-aggregate-common",
  "datafusion-functions-window-common",
  "datafusion-physical-expr-common",
- "indexmap 2.13.0",
+ "indexmap 2.14.0",
  "itertools",
  "paste",
  "recursive",
@@ -1668,22 +1679,22 @@ dependencies = [
 
 [[package]]
 name = "datafusion-expr-common"
-version = "53.0.0"
+version = "53.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ab05fdd00e05d5a6ee362882546d29d6d3df43a6c55355164a7fbee12d163bc9"
+checksum = "7d7c3adf3db8bf61e92eb90cb659c8e8b734593a8f7c8e12a843c7ddba24b87e"
 dependencies = [
  "arrow",
  "datafusion-common",
- "indexmap 2.13.0",
+ "indexmap 2.14.0",
  "itertools",
  "paste",
 ]
 
 [[package]]
 name = "datafusion-functions"
-version = "53.0.0"
+version = "53.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "04fb863482d987cf938db2079e07ab0d3bb64595f28907a6c2f8671ad71cca7e"
+checksum = "f28aa4e10384e782774b10e72aca4d93ef7b31aa653095d9d4536b0a3dbc51b6"
 dependencies = [
  "arrow",
  "arrow-buffer",
@@ -1704,7 +1715,7 @@ dependencies = [
  "md-5",
  "memchr",
  "num-traits",
- "rand 0.9.2",
+ "rand 0.9.4",
  "regex",
  "sha2",
  "unicode-segmentation",
@@ -1713,9 +1724,9 @@ dependencies = [
 
 [[package]]
 name = "datafusion-functions-aggregate"
-version = "53.0.0"
+version = "53.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "829856f4e14275fb376c104f27cbf3c3b57a9cfe24885d98677525f5e43ce8d6"
+checksum = "00aa6217e56098ba84e0a338176fe52f0a84cca398021512c6c8c5eff806d0ad"
 dependencies = [
  "ahash",
  "arrow",
@@ -1735,9 +1746,9 @@ dependencies = [
 
 [[package]]
 name = "datafusion-functions-aggregate-common"
-version = "53.0.0"
+version = "53.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "08af79cc3d2aa874a362fb97decfcbd73d687190cb096f16a6c85a7780cce311"
+checksum = "b511250349407db7c43832ab2de63f5557b19a20dfd236b39ca2c04468b50d47"
 dependencies = [
  "ahash",
  "arrow",
@@ -1748,9 +1759,9 @@ dependencies = [
 
 [[package]]
 name = "datafusion-functions-nested"
-version = "53.0.0"
+version = "53.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "465ae3368146d49c2eda3e2c0ef114424c87e8a6b509ab34c1026ace6497e790"
+checksum = "ef13a858e20d50f0a9bb5e96e7ac82b4e7597f247515bccca4fdd2992df0212a"
 dependencies = [
  "arrow",
  "arrow-ord",
@@ -1773,9 +1784,9 @@ dependencies = [
 
 [[package]]
 name = "datafusion-functions-table"
-version = "53.0.0"
+version = "53.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6156e6b22fcf1784112fc0173f3ae6e78c8fdb4d3ed0eace9543873b437e2af6"
+checksum = "72b40d3f5bbb3905f9ccb1ce9485a9595c77b69758a7c24d3ba79e334ff51e7e"
 dependencies = [
  "arrow",
  "async-trait",
@@ -1789,9 +1800,9 @@ dependencies = [
 
 [[package]]
 name = "datafusion-functions-window"
-version = "53.0.0"
+version = "53.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ca7baec14f866729012efb89011a6973f3a346dc8090c567bfcd328deff551c1"
+checksum = "d4e88ec9d57c9b685d02f58bfee7be62d72610430ddcedb82a08e5d9925dbfb6"
 dependencies = [
  "arrow",
  "datafusion-common",
@@ -1807,9 +1818,9 @@ dependencies = [
 
 [[package]]
 name = "datafusion-functions-window-common"
-version = "53.0.0"
+version = "53.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "159228c3280d342658466bb556dc24de30047fe1d7e559dc5d16ccc5324166f9"
+checksum = "8307bb93519b1a91913723a1130cfafeee3f72200d870d88e91a6fc5470ede5c"
 dependencies = [
  "datafusion-common",
  "datafusion-physical-expr-common",
@@ -1817,9 +1828,9 @@ dependencies = [
 
 [[package]]
 name = "datafusion-macros"
-version = "53.0.0"
+version = "53.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e5427e5da5edca4d21ea1c7f50e1c9421775fe33d7d5726e5641a833566e7578"
+checksum = "2e367e6a71051d0ebdd29b2f85d12059b38b1d1f172c6906e80016da662226bd"
 dependencies = [
  "datafusion-doc",
  "quote",
@@ -1828,9 +1839,9 @@ dependencies = [
 
 [[package]]
 name = "datafusion-optimizer"
-version = "53.0.0"
+version = "53.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "89099eefcd5b223ec685c36a41d35c69239236310d71d339f2af0fa4383f3f46"
+checksum = "e929015451a67f77d9d8b727b2bf3a40c4445fdef6cdc53281d7d97c76888ace"
 dependencies = [
  "arrow",
  "chrono",
@@ -1838,7 +1849,7 @@ dependencies = [
  "datafusion-expr",
  "datafusion-expr-common",
  "datafusion-physical-expr",
- "indexmap 2.13.0",
+ "indexmap 2.14.0",
  "itertools",
  "log",
  "recursive",
@@ -1848,9 +1859,9 @@ dependencies = [
 
 [[package]]
 name = "datafusion-physical-expr"
-version = "53.0.0"
+version = "53.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0f222df5195d605d79098ef37bdd5323bff0131c9d877a24da6ec98dfca9fe36"
+checksum = "4b1e68aba7a4b350401cfdf25a3d6f989ad898a7410164afe9ca52080244cb59"
 dependencies = [
  "ahash",
  "arrow",
@@ -1861,7 +1872,7 @@ dependencies = [
  "datafusion-physical-expr-common",
  "half",
  "hashbrown 0.16.1",
- "indexmap 2.13.0",
+ "indexmap 2.14.0",
  "itertools",
  "parking_lot",
  "paste",
@@ -1872,9 +1883,9 @@ dependencies = [
 
 [[package]]
 name = "datafusion-physical-expr-adapter"
-version = "53.0.0"
+version = "53.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "40838625d63d9c12549d81979db3dd675d159055eb9135009ba272ab0e8d0f64"
+checksum = "ea22315f33cf2e0adc104e8ec42e285f6ed93998d565c65e82fec6a9ee9f9db4"
 dependencies = [
  "arrow",
  "datafusion-common",
@@ -1887,9 +1898,9 @@ dependencies = [
 
 [[package]]
 name = "datafusion-physical-expr-common"
-version = "53.0.0"
+version = "53.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "eacbcc4cfd502558184ed58fa3c72e775ec65bf077eef5fd2b3453db676f893c"
+checksum = "b04b45ea8ad3ac2d78f2ea2a76053e06591c9629c7a603eda16c10649ecf4362"
 dependencies = [
  "ahash",
  "arrow",
@@ -1897,16 +1908,16 @@ dependencies = [
  "datafusion-common",
  "datafusion-expr-common",
  "hashbrown 0.16.1",
- "indexmap 2.13.0",
+ "indexmap 2.14.0",
  "itertools",
  "parking_lot",
 ]
 
 [[package]]
 name = "datafusion-physical-optimizer"
-version = "53.0.0"
+version = "53.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d501d0e1d0910f015677121601ac177ec59272ef5c9324d1147b394988f40941"
+checksum = "7cb13397809a425918f608dfe8653f332015a3e330004ab191b4404187238b95"
 dependencies = [
  "arrow",
  "datafusion-common",
@@ -1923,9 +1934,9 @@ dependencies = [
 
 [[package]]
 name = "datafusion-physical-plan"
-version = "53.0.0"
+version = "53.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "463c88ad6f1ecab1810f4c9f046898bee035b370137eb79b2b2db925e270631d"
+checksum = "5edc023675791af9d5fb4cc4c24abf5f7bd3bd4dcf9e5bd90ea1eff6976dcc79"
 dependencies = [
  "ahash",
  "arrow",
@@ -1944,7 +1955,7 @@ dependencies = [
  "futures",
  "half",
  "hashbrown 0.16.1",
- "indexmap 2.13.0",
+ "indexmap 2.14.0",
  "itertools",
  "log",
  "num-traits",
@@ -1955,9 +1966,9 @@ dependencies = [
 
 [[package]]
 name = "datafusion-proto"
-version = "53.0.0"
+version = "53.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "677ee4448a010ed5faeff8d73ff78972c2ace59eff3cd7bd15833a1dafa00492"
+checksum = "6a387aaef949dc16bb6abc81bd1af850ec7449183aef011214f9724957495738"
 dependencies = [
  "arrow",
  "chrono",
@@ -1978,14 +1989,14 @@ dependencies = [
  "datafusion-proto-common",
  "object_store",
  "prost",
- "rand 0.9.2",
+ "rand 0.9.4",
 ]
 
 [[package]]
 name = "datafusion-proto-common"
-version = "53.0.0"
+version = "53.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "965eca01edc8259edbbd95883a00b6d81e329fd44a019cfac3a03b026a83eade"
+checksum = "16e614c7c53a9c304c6a850b821010bb492e57300311835f1180613f9d2c63d9"
 dependencies = [
  "arrow",
  "datafusion-common",
@@ -1994,9 +2005,9 @@ dependencies = [
 
 [[package]]
 name = "datafusion-pruning"
-version = "53.0.0"
+version = "53.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2857618a0ecbd8cd0cf29826889edd3a25774ec26b2995fc3862095c95d88fc6"
+checksum = "ac8c76860e355616555081cab5968cec1af7a80701ff374510860bcd567e365a"
 dependencies = [
  "arrow",
  "datafusion-common",
@@ -2011,9 +2022,9 @@ dependencies = [
 
 [[package]]
 name = "datafusion-session"
-version = "53.0.0"
+version = "53.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ef8637e35022c5c775003b3ab1debc6b4a8f0eb41b069bdd5475dd3aa93f6eba"
+checksum = "5412111aa48e2424ba926112e192f7a6b7e4ccb450145d25ce5ede9f19dc491e"
 dependencies = [
  "async-trait",
  "datafusion-common",
@@ -2025,9 +2036,9 @@ dependencies = [
 
 [[package]]
 name = "datafusion-sql"
-version = "53.0.0"
+version = "53.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "12d9e9f16a1692a11c94bcc418191fa15fd2b4d72a0c1a0c607db93c0b84dd81"
+checksum = "fa0d133ddf8b9b3b872acac900157f783e7b879fe9a6bccf389abebbfac45ec1"
 dependencies = [
  "arrow",
  "bigdecimal",
@@ -2035,7 +2046,7 @@ dependencies = [
  "datafusion-common",
  "datafusion-expr",
  "datafusion-functions-nested",
- "indexmap 2.13.0",
+ "indexmap 2.14.0",
  "log",
  "recursive",
  "regex",
@@ -2114,9 +2125,9 @@ dependencies = [
 
 [[package]]
 name = "dioxus"
-version = "0.7.4"
+version = "0.7.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0d5b0aec58753daee127a5fe2d1a40b0db8cebc0b8a7f97b34df2492cb90d78e"
+checksum = "a44c550c06b6785e16258ad620d5b559f5bbcbcc50e3c18c08aa6af2604a4c32"
 dependencies = [
  "dioxus-asset-resolver",
  "dioxus-cli-config",
@@ -2147,9 +2158,9 @@ dependencies = [
 
 [[package]]
 name = "dioxus-asset-resolver"
-version = "0.7.4"
+version = "0.7.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c240c4f092024b26e200ecd64723009173cf5bc2e5083c9feb778c077eb5741b"
+checksum = "a8b546050ecfc7fcd310be344b2f3a2a79f21554c5a9e8df28e7a07e9b36009b"
 dependencies = [
  "dioxus-cli-config",
  "http",
@@ -2168,18 +2179,18 @@ dependencies = [
 
 [[package]]
 name = "dioxus-cli-config"
-version = "0.7.4"
+version = "0.7.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "86a13d42c5defcea333bdbae1dc5d64d078acd0fda1d8a1441c37e06be5146e3"
+checksum = "d4ad73f0ff638cd27466d389cd57f0975f909b66130dc1c25d5212d4041e5352"
 dependencies = [
  "wasm-bindgen",
 ]
 
 [[package]]
 name = "dioxus-config-macro"
-version = "0.7.4"
+version = "0.7.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1ba1d68a05a8a15293ba65d45c7a3263356f3eedf1a3e599440683f3eb014637"
+checksum = "e004bc8b958031117d373db2b5e0ab9d7e763751de129dd36f00ef7e318333cd"
 dependencies = [
  "proc-macro2",
  "quote",
@@ -2187,15 +2198,15 @@ dependencies = [
 
 [[package]]
 name = "dioxus-config-macros"
-version = "0.7.4"
+version = "0.7.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f43f2d511d3c3c439a2fb7f863668b84caf8e0d2440cbfbcbb28521e26ba7f44"
+checksum = "808a9994a9a2623e6b6890b6cc68def24bd669177ec4713684447fb46418c256"
 
 [[package]]
 name = "dioxus-core"
-version = "0.7.4"
+version = "0.7.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fb3dd61889e6a09daec93d44db86047fb8e6603beedcf9351b8528582254e075"
+checksum = "247ed8d679a13232641f1c84ba22246623fae01320b4c22db225c0b4f2fa7398"
 dependencies = [
  "anyhow",
  "const_format",
@@ -2204,7 +2215,7 @@ dependencies = [
  "futures-util",
  "generational-box",
  "longest-increasing-subsequence",
- "rustc-hash 2.1.1",
+ "rustc-hash 2.1.2",
  "rustversion",
  "serde",
  "slab",
@@ -2215,9 +2226,9 @@ dependencies = [
 
 [[package]]
 name = "dioxus-core-macro"
-version = "0.7.4"
+version = "0.7.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8577c4d9a8cc23423c4d2137319044b03ab940e4b2790dd25f4f06601bd32d9a"
+checksum = "75fbe64029b90144041f8521300c7b3508e6c48caea3244de2ff5d1ade15390c"
 dependencies = [
  "convert_case 0.8.0",
  "dioxus-rsx",
@@ -2228,15 +2239,15 @@ dependencies = [
 
 [[package]]
 name = "dioxus-core-types"
-version = "0.7.4"
+version = "0.7.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b99d7d199aad72431b549759550002e7d72c8a257eba500dca9fbdb2122de103"
+checksum = "cbfea5c8946e0745b254b5c33c515d81b3ba638f33c2a532ed06730392394d4d"
 
 [[package]]
 name = "dioxus-devtools"
-version = "0.7.4"
+version = "0.7.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d27e7212436a581ce058d7554f1383916bd18a68ebd6015b0b4c2e9ecb0d5535"
+checksum = "08d30370fa78266aed3f3d9119dea2de9e92a0348941dbfa777f7a669f2ea375"
 dependencies = [
  "dioxus-cli-config",
  "dioxus-core",
@@ -2254,9 +2265,9 @@ dependencies = [
 
 [[package]]
 name = "dioxus-devtools-types"
-version = "0.7.4"
+version = "0.7.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6aa24ed651b97e0b423270bf07a0f1b7dc0e0fa1f1dc26407cd2a118d6bf9de5"
+checksum = "3907a2b61cf56039f047da6a37317a03d9e15411753bc40e5e34a27e405ac320"
 dependencies = [
  "dioxus-core",
  "serde",
@@ -2265,9 +2276,9 @@ dependencies = [
 
 [[package]]
 name = "dioxus-document"
-version = "0.7.4"
+version = "0.7.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "24685cb51cc6227ea606c49dfe531836f362c49183d3007241afcd8827498401"
+checksum = "e3b75a1809af7c13546ae4487c8b02ab80cc4d059e7db5a5d090374ceaa71d5b"
 dependencies = [
  "dioxus-core",
  "dioxus-core-macro",
@@ -2284,9 +2295,9 @@ dependencies = [
 
 [[package]]
 name = "dioxus-fullstack"
-version = "0.7.4"
+version = "0.7.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5940c870751b6273a23b7c0e16d80039f45604d68d9b86c91e27b09edeabeb9e"
+checksum = "c8ee56dd65fbf1222fa6a2749c3f821df28facf1832854b43d480b547a096f6d"
 dependencies = [
  "anyhow",
  "async-stream",
@@ -2332,7 +2343,7 @@ dependencies = [
  "thiserror 2.0.18",
  "tokio",
  "tokio-stream",
- "tokio-tungstenite",
+ "tokio-tungstenite 0.28.0",
  "tokio-util",
  "tower",
  "tower-http",
@@ -2349,9 +2360,9 @@ dependencies = [
 
 [[package]]
 name = "dioxus-fullstack-core"
-version = "0.7.4"
+version = "0.7.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "28333274cfc8e5fe547ab04258c2511350c4930a07af9616d365dc4ba7b22d8f"
+checksum = "d40d33a447cb158acdb61787b2ff52dd8a0f9a9f20e95e5c5fe9873f01c2b55b"
 dependencies = [
  "anyhow",
  "axum-core",
@@ -2377,9 +2388,9 @@ dependencies = [
 
 [[package]]
 name = "dioxus-fullstack-macro"
-version = "0.7.4"
+version = "0.7.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "53f7e5a9fa7f657aa519a07aced8b8936f3ae8a246d94855d497d8cce59b9533"
+checksum = "7c06eb66bce50d5f47b793e6af5fc2e0a511bf2b4fa2423cf86e35023d8f17e6"
 dependencies = [
  "const_format",
  "convert_case 0.8.0",
@@ -2391,9 +2402,9 @@ dependencies = [
 
 [[package]]
 name = "dioxus-history"
-version = "0.7.4"
+version = "0.7.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "010b446322b3f9176476579fa61c7552f0430abbeec418cab543482da6ca4363"
+checksum = "c1d8024afd482956eadae2c43d0b1e73e584adb724ac09be87f268d52002387b"
 dependencies = [
  "dioxus-core",
  "tracing",
@@ -2401,9 +2412,9 @@ dependencies = [
 
 [[package]]
 name = "dioxus-hooks"
-version = "0.7.4"
+version = "0.7.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "09e7a6ba279050cc161e1215c6db0bd15915c9314ec2916d7b22c113a3039536"
+checksum = "233b5e168a7c38c4bf96d0f390221a72a5a28bb31ce2dcf6d2af879dc561ef42"
 dependencies = [
  "dioxus-core",
  "dioxus-signals",
@@ -2417,9 +2428,9 @@ dependencies = [
 
 [[package]]
 name = "dioxus-html"
-version = "0.7.4"
+version = "0.7.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f0715e38cc6537aef5b79d0ddc1f4d7a56c2f4debe46b127eee24d8aa5dafd2d"
+checksum = "4abf4ad27eee650d1ab8ebe13591e8b0ee595fa5a5dd236be13a5b7b3fab678d"
 dependencies = [
  "async-trait",
  "bytes",
@@ -2444,9 +2455,9 @@ dependencies = [
 
 [[package]]
 name = "dioxus-html-internal-macro"
-version = "0.7.4"
+version = "0.7.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ff6b7918b0908c8719a6165b4e3c362da4fd311fc7cb48720eddd8a45b2ddfc6"
+checksum = "025e107e677f790f4ed648a189e36f51ae014e5341902b97313e4eae626cffa2"
 dependencies = [
  "convert_case 0.8.0",
  "proc-macro2",
@@ -2456,16 +2467,16 @@ dependencies = [
 
 [[package]]
 name = "dioxus-interpreter-js"
-version = "0.7.4"
+version = "0.7.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a8ce1cf487007f90d0ec4ec87dff111d74ac04fca0918f9dcc4e80dc3b0531b2"
+checksum = "57caa76427d8ec4105ccca44ff8c511055688af732a094b9fe9ef3547d2a11b2"
 dependencies = [
  "dioxus-core",
  "dioxus-core-types",
  "dioxus-html",
  "js-sys",
  "lazy-js-bundle",
- "rustc-hash 2.1.1",
+ "rustc-hash 2.1.2",
  "sledgehammer_bindgen",
  "sledgehammer_utils",
  "wasm-bindgen",
@@ -2475,9 +2486,9 @@ dependencies = [
 
 [[package]]
 name = "dioxus-liveview"
-version = "0.7.4"
+version = "0.7.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9407df2eec82681fa2195282762dddacc40563445df36b3ad1df9d69d4eaa073"
+checksum = "64e64d86ad604897c796fdcc1f1f42cab838258647dd47664ab637a8eb97f08e"
 dependencies = [
  "axum",
  "dioxus-cli-config",
@@ -2490,7 +2501,7 @@ dependencies = [
  "futures-channel",
  "futures-util",
  "generational-box",
- "rustc-hash 2.1.1",
+ "rustc-hash 2.1.2",
  "serde",
  "serde_json",
  "slab",
@@ -2503,9 +2514,9 @@ dependencies = [
 
 [[package]]
 name = "dioxus-logger"
-version = "0.7.4"
+version = "0.7.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d4742b16791a71eb4db2d0747f15c50b278b27369b3d93e5a4d6ec2570bcb9bc"
+checksum = "0cbbee192b1b12fccb444a5b04d809710cfce4d27b792129fea6c845fae7f329"
 dependencies = [
  "dioxus-cli-config",
  "tracing",
@@ -2515,9 +2526,9 @@ dependencies = [
 
 [[package]]
 name = "dioxus-router"
-version = "0.7.4"
+version = "0.7.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ae50f5efa8d6f936c0c3bb85d7a55f6f19290f106290e331d1136d964e832fe6"
+checksum = "ecdb19d7a1489ba252be9b3a07f9db92814020eb4ba8c1306f04242a44d17e66"
 dependencies = [
  "dioxus-cli-config",
  "dioxus-core",
@@ -2536,9 +2547,9 @@ dependencies = [
 
 [[package]]
 name = "dioxus-router-macro"
-version = "0.7.4"
+version = "0.7.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ae9beca02f6baca4b223256805536dc92e77a1541bb2331723100f66aae79332"
+checksum = "b525ab775585f1dc4850178de9d0cb6bb37f7b9c1a1a0c121eab7449d7021480"
 dependencies = [
  "base16",
  "digest",
@@ -2551,9 +2562,9 @@ dependencies = [
 
 [[package]]
 name = "dioxus-rsx"
-version = "0.7.4"
+version = "0.7.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "344621f6dc435e76fbe272da09988d0118cf35cc2aa88ebb5ae7c1317a36e57c"
+checksum = "37fb07e40e9734946511659668ea3675ed214b60889e7aa7f0a5a85271518475"
 dependencies = [
  "proc-macro2",
  "proc-macro2-diagnostics",
@@ -2564,9 +2575,9 @@ dependencies = [
 
 [[package]]
 name = "dioxus-server"
-version = "0.7.4"
+version = "0.7.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d04f3e741d9b866f38c20f368fdf84226b27ca341fa0108cf2e0bf6cdb40c7e7"
+checksum = "5aa22ad381073c68a70cdb28152485abb5f2dcf20dabc68c631e26ce7ac046dd"
 dependencies = [
  "anyhow",
  "async-trait",
@@ -2603,14 +2614,14 @@ dependencies = [
  "lru",
  "parking_lot",
  "pin-project",
- "rustc-hash 2.1.1",
+ "rustc-hash 2.1.2",
  "serde",
  "serde_json",
  "serde_qs",
  "subsecond",
  "thiserror 2.0.18",
  "tokio",
- "tokio-tungstenite",
+ "tokio-tungstenite 0.28.0",
  "tokio-util",
  "tower",
  "tower-http",
@@ -2622,37 +2633,37 @@ dependencies = [
 
 [[package]]
 name = "dioxus-signals"
-version = "0.7.4"
+version = "0.7.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "409bf65d243443416650945f22cd6caf2a6bb13ae0347a50ec5852adb1961072"
+checksum = "5393fd579f42c6547bf47ec0a2dedf2a366cd541d31deedc1059a096e6c35798"
 dependencies = [
  "dioxus-core",
  "futures-channel",
  "futures-util",
  "generational-box",
  "parking_lot",
- "rustc-hash 2.1.1",
+ "rustc-hash 2.1.2",
  "tracing",
  "warnings",
 ]
 
 [[package]]
 name = "dioxus-ssr"
-version = "0.7.4"
+version = "0.7.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "43f16c0c648d1a650be65a16bc24a719519352ab94e6205cceaa300d9c9c5f88"
+checksum = "d17c75a43e218012b63a97f55cf585747bd0ca37840ac2a0cf40add06ffcf9fe"
 dependencies = [
  "askama_escape",
  "dioxus-core",
  "dioxus-core-types",
- "rustc-hash 2.1.1",
+ "rustc-hash 2.1.2",
 ]
 
 [[package]]
 name = "dioxus-stores"
-version = "0.7.4"
+version = "0.7.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "245ec4f84348e5be77451bd204181998b8bc0995b48ff3adb2db0e0ec430dab4"
+checksum = "f1c59f52e8194439604dd35f8c540456c22b4a4b076930e424d0289f98ea3cb4"
 dependencies = [
  "dioxus-core",
  "dioxus-signals",
@@ -2662,9 +2673,9 @@ dependencies = [
 
 [[package]]
 name = "dioxus-stores-macro"
-version = "0.7.4"
+version = "0.7.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "dd9da8e9a1cc2d8bff387e0b99f09f2590b71f67d5d73ab343b2cc9d17990d92"
+checksum = "737600865572cecf60ff934f88252bd4144f4ca189e0e570b7a780e8f0e01a1f"
 dependencies = [
  "convert_case 0.8.0",
  "proc-macro2",
@@ -2674,9 +2685,9 @@ dependencies = [
 
 [[package]]
 name = "dioxus-web"
-version = "0.7.4"
+version = "0.7.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "eac92ef863bc5333440021e8ec3e538a39598c9c960daeaab66ab10ba940b5e0"
+checksum = "176eb0a5ee8251203a816b413a64fdc014b43e53d4245f769b1b0c2035b88ac3"
 dependencies = [
  "dioxus-cli-config",
  "dioxus-core",
@@ -2694,7 +2705,7 @@ dependencies = [
  "gloo-timers",
  "js-sys",
  "lazy-js-bundle",
- "rustc-hash 2.1.1",
+ "rustc-hash 2.1.2",
  "send_wrapper",
  "serde",
  "serde-wasm-bindgen",
@@ -2706,16 +2717,6 @@ dependencies = [
  "web-sys",
 ]
 
-[[package]]
-name = "dispatch2"
-version = "0.3.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1e0e367e4e7da84520dedcac1901e4da967309406d1e51017ae1abfb97adbd38"
-dependencies = [
- "bitflags 2.11.0",
- "objc2",
-]
-
 [[package]]
 name = "displaydoc"
 version = "0.2.5"
@@ -2883,9 +2884,9 @@ dependencies = [
 
 [[package]]
 name = "euclid"
-version = "0.22.13"
+version = "0.22.14"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "df61bf483e837f88d5c2291dcf55c67be7e676b3a51acc48db3a7b163b91ed63"
+checksum = "f1a05365e3b1c6d1650318537c7460c6923f1abdd272ad6842baa2b509957a06"
 dependencies = [
  "num-traits",
  "serde",
@@ -2941,7 +2942,7 @@ dependencies = [
  "fastrace-macro",
  "parking_lot",
  "pin-project",
- "rand 0.9.2",
+ "rand 0.9.4",
  "rtrb",
  "serde",
 ]
@@ -2985,9 +2986,9 @@ dependencies = [
 
 [[package]]
 name = "fastrand"
-version = "2.3.0"
+version = "2.4.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be"
+checksum = "9f1f227452a390804cdb637b74a86990f2a7d7ba4b7d5693aac9b4dd6defd8d6"
 
 [[package]]
 name = "find-msvc-tools"
@@ -3019,7 +3020,7 @@ version = "25.12.19"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "35f6839d7b3b98adde531effaf34f0c2badc6f4735d26fe74709d8e513a96ef3"
 dependencies = [
- "bitflags 2.11.0",
+ "bitflags 2.11.1",
  "rustc_version",
 ]
 
@@ -3063,11 +3064,11 @@ dependencies = [
 
 [[package]]
 name = "fsst-rs"
-version = "0.5.9"
+version = "0.5.10"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "cdf65e16e100438be0030d113042e07a62bed67203998640ca6fae0404eed71e"
+checksum = "3bf53d7c403a2b76873d4d66ba7d79c54bde2784cdaba6083f223d6e33270708"
 dependencies = [
- "rustc-hash 2.1.1",
+ "rustc-hash 2.1.2",
 ]
 
 [[package]]
@@ -3166,29 +3167,14 @@ dependencies = [
 
 [[package]]
 name = "generational-box"
-version = "0.7.4"
+version = "0.7.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4ede46ff252793f9b6ef752c506ba8600c69d73cad2ef9bbf2e6dee85019a3bc"
+checksum = "c68d74be1fbe3bba37604bdfd61403f26af9f6324cf325053abd89d60c22e799"
 dependencies = [
  "parking_lot",
  "tracing",
 ]
 
-[[package]]
-name = "generator"
-version = "0.8.8"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "52f04ae4152da20c76fe800fa48659201d5cf627c5149ca0b707b69d7eef6cf9"
-dependencies = [
- "cc",
- "cfg-if",
- "libc",
- "log",
- "rustversion",
- "windows-link",
- "windows-result",
-]
-
 [[package]]
 name = "generic-array"
 version = "0.14.7"
@@ -3235,7 +3221,7 @@ dependencies = [
  "cfg-if",
  "libc",
  "r-efi 6.0.0",
- "rand_core 0.10.0",
+ "rand_core 0.10.1",
  "wasip2",
  "wasip3",
 ]
@@ -3321,7 +3307,7 @@ dependencies = [
  "futures-core",
  "futures-sink",
  "http",
- "indexmap 2.13.0",
+ "indexmap 2.14.0",
  "slab",
  "tokio",
  "tokio-util",
@@ -3372,6 +3358,12 @@ dependencies = [
  "foldhash 0.2.0",
 ]
 
+[[package]]
+name = "hashbrown"
+version = "0.17.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4f467dd6dccf739c208452f8014c75c18bb8301b050ad1cfb27153803edb0f51"
+
 [[package]]
 name = "hdrhistogram"
 version = "7.5.4"
@@ -3428,15 +3420,6 @@ version = "0.4.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70"
 
-[[package]]
-name = "home"
-version = "0.5.12"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "cc627f471c528ff0c4a49e1d5e60450c8f6461dd6d10ba9dcd3a61d3dff7728d"
-dependencies = [
- "windows-sys 0.61.2",
-]
-
 [[package]]
 name = "http"
 version = "1.4.0"
@@ -3496,9 +3479,9 @@ checksum = "135b12329e5e3ce057a9f972339ea52bc954fe1e9358ef27f95e89716fbc5424"
 
 [[package]]
 name = "hyper"
-version = "1.8.1"
+version = "1.9.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2ab2d4f250c3d7b1c9fcdff1cece94ea4e2dfbec68614f7b87cb205f24ca9d11"
+checksum = "6299f016b246a94207e63da54dbe807655bf9e00044f73ded42c3ac5305fbcca"
 dependencies = [
  "atomic-waker",
  "bytes",
@@ -3511,7 +3494,6 @@ dependencies = [
  "httpdate",
  "itoa",
  "pin-project-lite",
- "pin-utils",
  "smallvec",
  "tokio",
  "want",
@@ -3519,16 +3501,15 @@ dependencies = [
 
 [[package]]
 name = "hyper-rustls"
-version = "0.27.7"
+version = "0.27.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e3c93eb611681b207e1fe55d5a71ecf91572ec8a6705cdb6857f7d8d5242cf58"
+checksum = "33ca68d021ef39cf6463ab54c1d0f5daf03377b70561305bb89a8f83aab66e0f"
 dependencies = [
  "http",
  "hyper",
  "hyper-util",
  "rustls",
  "rustls-native-certs",
- "rustls-pki-types",
  "tokio",
  "tokio-rustls",
  "tower-service",
@@ -3600,12 +3581,13 @@ dependencies = [
 
 [[package]]
 name = "icu_collections"
-version = "2.1.1"
+version = "2.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4c6b649701667bbe825c3b7e6388cb521c23d88644678e83c0c4d0a621a34b43"
+checksum = "2984d1cd16c883d7935b9e07e44071dca8d917fd52ecc02c04d5fa0b5a3f191c"
 dependencies = [
  "displaydoc",
  "potential_utf",
+ "utf8_iter",
  "yoke",
  "zerofrom",
  "zerovec",
@@ -3613,9 +3595,9 @@ dependencies = [
 
 [[package]]
 name = "icu_locale_core"
-version = "2.1.1"
+version = "2.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "edba7861004dd3714265b4db54a3c390e880ab658fec5f7db895fae2046b5bb6"
+checksum = "92219b62b3e2b4d88ac5119f8904c10f8f61bf7e95b640d25ba3075e6cac2c29"
 dependencies = [
  "displaydoc",
  "litemap",
@@ -3626,9 +3608,9 @@ dependencies = [
 
 [[package]]
 name = "icu_normalizer"
-version = "2.1.1"
+version = "2.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5f6c8828b67bf8908d82127b2054ea1b4427ff0230ee9141c54251934ab1b599"
+checksum = "c56e5ee99d6e3d33bd91c5d85458b6005a22140021cc324cea84dd0e72cff3b4"
 dependencies = [
  "icu_collections",
  "icu_normalizer_data",
@@ -3640,15 +3622,15 @@ dependencies = [
 
 [[package]]
 name = "icu_normalizer_data"
-version = "2.1.1"
+version = "2.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7aedcccd01fc5fe81e6b489c15b247b8b0690feb23304303a9e560f37efc560a"
+checksum = "da3be0ae77ea334f4da67c12f149704f19f81d1adf7c51cf482943e84a2bad38"
 
 [[package]]
 name = "icu_properties"
-version = "2.1.2"
+version = "2.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "020bfc02fe870ec3a66d93e677ccca0562506e5872c650f893269e08615d74ec"
+checksum = "bee3b67d0ea5c2cca5003417989af8996f8604e34fb9ddf96208a033901e70de"
 dependencies = [
  "icu_collections",
  "icu_locale_core",
@@ -3660,15 +3642,15 @@ dependencies = [
 
 [[package]]
 name = "icu_properties_data"
-version = "2.1.2"
+version = "2.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "616c294cf8d725c6afcd8f55abc17c56464ef6211f9ed59cccffe534129c77af"
+checksum = "8e2bbb201e0c04f7b4b3e14382af113e17ba4f63e2c9d2ee626b720cbce54a14"
 
 [[package]]
 name = "icu_provider"
-version = "2.1.1"
+version = "2.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "85962cf0ce02e1e0a629cc34e7ca3e373ce20dda4c4d7294bbd0bf1fdb59e614"
+checksum = "139c4cf31c8b5f33d7e199446eff9c1e02decfc2f0eec2c8d71f65befa45b421"
 dependencies = [
  "displaydoc",
  "icu_locale_core",
@@ -3704,9 +3686,9 @@ dependencies = [
 
 [[package]]
 name = "idna_adapter"
-version = "1.2.1"
+version = "1.2.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3acae9609540aa318d1bc588455225fb2085b9ed0c4f6bd0d9d5bcd86f1a0344"
+checksum = "cb68373c0d6620ef8105e855e7745e18b0d00d3bdb07fb532e434244cdb9a714"
 dependencies = [
  "icu_normalizer",
  "icu_properties",
@@ -3724,12 +3706,12 @@ dependencies = [
 
 [[package]]
 name = "indexmap"
-version = "2.13.0"
+version = "2.14.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7714e70437a7dc3ac8eb7e6f8df75fd8eb422675fc7678aff7364301092b1017"
+checksum = "d466e9454f08e4a911e14806c24e16fba1b4c121d1ea474396f396069cf949d9"
 dependencies = [
  "equivalent",
- "hashbrown 0.16.1",
+ "hashbrown 0.17.0",
  "serde",
  "serde_core",
 ]
@@ -3750,7 +3732,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "232929e1d75fe899576a3d5c7416ad0d88dbfbb3c3d6aa00873a7408a50ddb88"
 dependencies = [
  "ahash",
- "indexmap 2.13.0",
+ "indexmap 2.14.0",
  "is-terminal",
  "itoa",
  "log",
@@ -3781,20 +3763,20 @@ checksum = "8bb03732005da905c88227371639bf1ad885cc712789c011c31c5fb3ab3ccf02"
 
 [[package]]
 name = "inventory"
-version = "0.3.22"
+version = "0.3.24"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "009ae045c87e7082cb72dab0ccd01ae075dd00141ddc108f43a0ea150a9e7227"
+checksum = "a4f0c30c76f2f4ccee3fe55a2435f691ca00c0e4bd87abe4f4a851b1d4dac39b"
 dependencies = [
  "rustversion",
 ]
 
 [[package]]
 name = "io-uring"
-version = "0.7.11"
+version = "0.7.12"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fdd7bddefd0a8833b88a4b68f90dae22c7450d11b354198baee3874fd811b344"
+checksum = "4d09b98f7eace8982db770e4408e7470b028ce513ac28fecdc6bf4c30fe92b62"
 dependencies = [
- "bitflags 2.11.0",
+ "bitflags 2.11.1",
  "cfg-if",
  "libc",
 ]
@@ -3807,9 +3789,9 @@ checksum = "d98f6fed1fde3f8c21bc40a1abb88dd75e67924f9cffc3ef95607bad8017f8e2"
 
 [[package]]
 name = "iri-string"
-version = "0.7.10"
+version = "0.7.12"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c91338f0783edbd6195decb37bae672fd3b165faffb89bf7b9e6942f8b1a731a"
+checksum = "25e659a4bb38e810ebc252e53b5814ff908a8c58c2a9ce2fae1bbec24cbf4e20"
 dependencies = [
  "memchr",
  "serde",
@@ -3843,15 +3825,15 @@ dependencies = [
 
 [[package]]
 name = "itoa"
-version = "1.0.17"
+version = "1.0.18"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "92ecc6618181def0457392ccd0ee51198e065e016d1d527a7ac1b6dc7c1f09d2"
+checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682"
 
 [[package]]
 name = "jiff"
-version = "0.2.23"
+version = "0.2.24"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1a3546dc96b6d42c5f24902af9e2538e82e39ad350b0c766eb3fbf2d8f3d8359"
+checksum = "f00b5dbd620d61dfdcb6007c9c1f6054ebd75319f163d886a9055cec1155073d"
 dependencies = [
  "jiff-static",
  "jiff-tzdb-platform",
@@ -3864,9 +3846,9 @@ dependencies = [
 
 [[package]]
 name = "jiff-static"
-version = "0.2.23"
+version = "0.2.24"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2a8c8b344124222efd714b73bb41f8b5120b27a7cc1c75593a6ff768d9d05aa4"
+checksum = "e000de030ff8022ea1da3f466fbb0f3a809f5e51ed31f6dd931c35181ad8e6d7"
 dependencies = [
  "proc-macro2",
  "quote",
@@ -3897,7 +3879,7 @@ dependencies = [
  "cesu8",
  "cfg-if",
  "combine",
-
"jni-sys", + "jni-sys 0.3.1", "log", "thiserror 1.0.69", "walkdir", @@ -3906,9 +3888,31 @@ dependencies = [ [[package]] name = "jni-sys" -version = "0.3.0" +version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8eaf4bc02d17cbdd7ff4c7438cafcdf7fb9a4613313ad11b4f8fefe7d3fa0130" +checksum = "41a652e1f9b6e0275df1f15b32661cf0d4b78d4d87ddec5e0c3c20f097433258" +dependencies = [ + "jni-sys 0.4.1", +] + +[[package]] +name = "jni-sys" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c6377a88cb3910bee9b0fa88d4f42e1d2da8e79915598f65fb0c7ee14c878af2" +dependencies = [ + "jni-sys-macros", +] + +[[package]] +name = "jni-sys-macros" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38c0b942f458fe50cdac086d2f946512305e5631e720728f2a61aabcd47a6264" +dependencies = [ + "quote", + "syn", +] [[package]] name = "jobserver" @@ -3922,40 +3926,46 @@ dependencies = [ [[package]] name = "js-sys" -version = "0.3.91" +version = "0.3.95" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b49715b7073f385ba4bc528e5747d02e66cb39c6146efb66b781f131f0fb399c" +checksum = "2964e92d1d9dc3364cae4d718d93f227e3abb088e747d92e0395bfdedf1c12ca" dependencies = [ + "cfg-if", + "futures-util", "once_cell", "wasm-bindgen", ] [[package]] -name = "kani-verifier" -version = "0.67.0" +name = "keyboard-types" +version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d6225a7ec13037e984f6bebdda33d34390e643a1cd2a4522cb5d5e8e6ed85e2b" +checksum = "b750dcadc39a09dbadd74e118f6dd6598df77fa01df0cfcdc52c28dece74528a" dependencies = [ - "anyhow", - "home", - "os_info", + "bitflags 2.11.1", + "serde", ] [[package]] -name = "keyboard-types" -version = "0.7.0" +name = "konst" +version = "0.2.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b750dcadc39a09dbadd74e118f6dd6598df77fa01df0cfcdc52c28dece74528a" +checksum = "128133ed7824fcd73d6e7b17957c5eb7bacb885649bd8c69708b2331a10bcefb" dependencies = [ - "bitflags 2.11.0", - "serde", + "konst_macro_rules", ] +[[package]] +name = "konst_macro_rules" +version = "0.2.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4933f3f57a8e9d9da04db23fb153356ecaf00cbd14aee46279c33dc80925c37" + [[package]] name = "lazy-js-bundle" -version = "0.7.4" +version = "0.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "60d7adc10cb9440d17fa67e467febdfc98931338773d11bfee81809af54d0697" +checksum = "ebbde2c5796719fbd82d6b8ec0be3dacf1f70c2876dee0f2c001632794d6641f" [[package]] name = "lazy_static" @@ -4028,15 +4038,15 @@ dependencies = [ [[package]] name = "libbz2-rs-sys" -version = "0.2.2" +version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c4a545a15244c7d945065b5d392b2d2d7f21526fba56ce51467b06ed445e8f7" +checksum = "b3a6a8c165077efc8f3a971534c50ea6a1a18b329ef4a66e897a7e3a1494565f" [[package]] name = "libc" -version = "0.2.183" +version = "0.2.186" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b5b646652bf6661599e1da8901b3b9522896f01e736bad5f723fe7a3a27f899d" +checksum = "68ab91017fe16c622486840e4c83c9a37afeff978bd239b5293d61ece587de66" [[package]] name = "libfuzzer-sys" @@ -4069,9 +4079,9 @@ dependencies = [ [[package]] name = "liblzma-sys" -version = "0.4.5" +version = "0.4.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"9f2db66f3268487b5033077f266da6777d057949b8f93c8ad82e441df25e6186" +checksum = "1a60851d15cd8c5346eca4ab8babff585be2ae4bc8097c067291d3ffe2add3b6" dependencies = [ "cc", "libc", @@ -4086,12 +4096,11 @@ checksum = "b6d2cec3eae94f9f509c767b45932f1ada8350c4bdb85af2fcab4a3c14807981" [[package]] name = "libmimalloc-sys" -version = "0.1.44" +version = "0.1.47" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "667f4fec20f29dfc6bc7357c582d91796c169ad7e2fce709468aefeb2c099870" +checksum = "2d1eacfa31c33ec25e873c136ba5669f00f9866d0688bea7be4d3f7e43067df6" dependencies = [ "cc", - "libc", ] [[package]] @@ -4123,7 +4132,6 @@ dependencies = [ "fsst-rs", "futures", "insta", - "kani-verifier", "liquid-cache-common", "log", "mimalloc", @@ -4131,7 +4139,7 @@ dependencies = [ "object_store", "parquet", "parquet-variant-compute", - "rand 0.10.0", + "rand 0.10.1", "serde", "serde_json", "shuttle", @@ -4171,7 +4179,7 @@ dependencies = [ "perf-event2", "pprof", "regex", - "reqwest 0.13.2", + "reqwest 0.13.3", "serde", "serde_json", "sysinfo", @@ -4207,7 +4215,6 @@ dependencies = [ "ahash", "arrow", "arrow-schema", - "async-trait", "bytes", "datafusion", "divan", @@ -4218,10 +4225,8 @@ dependencies = [ "log", "object_store", "parquet", - "parquet-variant-compute", "parquet-variant-json", - "rand 0.10.0", - "serde", + "rand 0.10.1", "serde_json", "shuttle", "t4", @@ -4314,9 +4319,9 @@ dependencies = [ [[package]] name = "litemap" -version = "0.8.1" +version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6373607a59f0be73a39b6fe456b8192fcc3585f602af20751600e974dd455e77" +checksum = "92daf443525c4cce67b150400bc2316076100ce0b3686209eb8cf3c31612e6f0" [[package]] name = "litrs" @@ -4408,9 +4413,9 @@ checksum = "b3bd0dd2cd90571056fdb71f6275fada10131182f84899f4b2a916e565d81d86" [[package]] name = "lru" -version = "0.16.3" +version = "0.16.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a1dc47f592c06f33f8e3aea9591776ec7c9f9e4124778ff8a3c3b87159f7e593" +checksum = "7f66e8d5d03f609abc3a39e6f08e4164ebf1447a732906d39eb9b99b7919ef39" dependencies = [ "hashbrown 0.16.1", ] @@ -4443,9 +4448,9 @@ dependencies = [ [[package]] name = "manganis" -version = "0.7.4" +version = "0.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "492da8d77990281eabe6ded633e7b0cf805c5cf7a023a99abed8811edc872d6f" +checksum = "2e06225f29a781d86afdfafa562de09621f2ace377136ae2ae9ca9e72a29b920" dependencies = [ "const-serialize 0.7.2", "const-serialize 0.8.0-alpha.0", @@ -4459,9 +4464,9 @@ dependencies = [ [[package]] name = "manganis-core" -version = "0.7.4" +version = "0.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a1b84cc2951f3b119702fab499b9b1aec3f454929c62feca55b895b82c628308" +checksum = "774ddc382b4fb30f3fdcf2418131cbac5e6111f00e6c7ddf9e598ef3b2b4cd91" dependencies = [ "const-serialize 0.7.2", "const-serialize 0.8.0-alpha.0", @@ -4473,9 +4478,9 @@ dependencies = [ [[package]] name = "manganis-macro" -version = "0.7.4" +version = "0.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6d2e60d36758b201b6ebb8a31aff6b013e58924eeb6d3cbf19aea764f51d69e4" +checksum = "731c83c89d831f341fb46eba0aefdfb3433f77a3f78a8b08d9d88746613a8f8b" dependencies = [ "dunce", "macro-string", @@ -4545,9 +4550,9 @@ dependencies = [ [[package]] name = "mimalloc" -version = "0.1.48" +version = "0.1.50" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"e1ee66a4b64c74f4ef288bcbb9192ad9c3feaad75193129ac8509af543894fd8" +checksum = "b3627c4272df786b9260cabaa46aec1d59c93ede723d4c3ef646c503816b0640" dependencies = [ "libmimalloc-sys", ] @@ -4586,9 +4591,9 @@ dependencies = [ [[package]] name = "mio" -version = "1.1.1" +version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a69bcab0ad47271a0234d9422b131806bf3968021e5dc9328caf2d4cd58557fc" +checksum = "50b7e5b27aa02a74bac8c3f23f448f8d87ff11f92d3aac1a6ed369ee08cc56c1" dependencies = [ "libc", "wasi", @@ -4618,8 +4623,8 @@ version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c3f42e7bbe13d351b6bead8286a43aac9534b82bd3cc43e47037f012ebfd62d4" dependencies = [ - "bitflags 2.11.0", - "jni-sys", + "bitflags 2.11.1", + "jni-sys 0.3.1", "log", "ndk-sys", "num_enum", @@ -4639,7 +4644,7 @@ version = "0.6.0+11769913" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ee6cda3051665f1fb8d9e08fc35c96d5a244fb1be711a03b71118828afc9a873" dependencies = [ - "jni-sys", + "jni-sys 0.3.1", ] [[package]] @@ -4653,18 +4658,6 @@ dependencies = [ "libc", ] -[[package]] -name = "nix" -version = "0.30.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "74523f3a35e05aba87a1d978330aef40f67b0304ac79c1c00b294c9830543db6" -dependencies = [ - "bitflags 2.11.0", - "cfg-if", - "cfg_aliases", - "libc", -] - [[package]] name = "nom" version = "7.1.3" @@ -4714,9 +4707,9 @@ dependencies = [ [[package]] name = "num-conv" -version = "0.2.0" +version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cf97ec579c3c42f953ef76dbf8d55ac91fb219dde70e49aa4a6b7d74e9919050" +checksum = "c6673768db2d862beb9b39a78fdcb1a69439615d5794a1be50caa9bc92c81967" [[package]] name = "num-format" @@ -4749,9 +4742,9 @@ dependencies = [ [[package]] name = "num_enum" -version = "0.7.5" +version = "0.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1207a7e20ad57b847bbddc6776b968420d38292bbfe2089accff5e19e82454c" +checksum = "5d0bca838442ec211fa11de3a8b0e0e8f3a4522575b5c4c06ed722e005036f26" dependencies = [ "num_enum_derive", "rustversion", @@ -4759,9 +4752,9 @@ dependencies = [ [[package]] name = "num_enum_derive" -version = "0.7.5" +version = "0.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ff32365de1b6743cb203b710788263c44a03de03802daf96092f2da4fe6ba4d7" +checksum = "680998035259dcfcafe653688bf2aa6d3e2dc05e98be6ab46afb089dc84f1df8" dependencies = [ "proc-macro-crate", "proc-macro2", @@ -4778,69 +4771,13 @@ dependencies = [ "objc2-encode", ] -[[package]] -name = "objc2-cloud-kit" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "17614fdcd9b411e6ff1117dfb1d0150f908ba83a7df81b1f118005fe0a8ea15d" -dependencies = [ - "bitflags 2.11.0", - "objc2", - "objc2-foundation", -] - -[[package]] -name = "objc2-core-data" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "291fbbf7d29287518e8686417cf7239c74700fd4b607623140a7d4a3c834329d" -dependencies = [ - "objc2", - "objc2-foundation", -] - [[package]] name = "objc2-core-foundation" version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2a180dd8642fa45cdb7dd721cd4c11b1cadd4929ce112ebd8b9f5803cc79d536" dependencies = [ - "bitflags 2.11.0", - "dispatch2", - "objc2", -] - -[[package]] -name = "objc2-core-graphics" -version = "0.3.1" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "989c6c68c13021b5c2d6b71456ebb0f9dc78d752e86a98da7c716f4f9470f5a4" -dependencies = [ - "bitflags 2.11.0", - "dispatch2", - "objc2", - "objc2-core-foundation", - "objc2-io-surface", -] - -[[package]] -name = "objc2-core-image" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "79b3dc0cc4386b6ccf21c157591b34a7f44c8e75b064f85502901ab2188c007e" -dependencies = [ - "objc2", - "objc2-foundation", -] - -[[package]] -name = "objc2-core-location" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac0f75792558aa9d618443bbb5db7426a7a0b6fddf96903f86ef9ad02e135740" -dependencies = [ - "objc2", - "objc2-foundation", + "bitflags 2.11.1", ] [[package]] @@ -4849,19 +4786,6 @@ version = "4.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ef25abbcd74fb2609453eb695bd2f860d389e457f67dc17cafc8b8cbc89d0c33" -[[package]] -name = "objc2-foundation" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "900831247d2fe1a09a683278e5384cfb8c80c79fe6b166f9d14bfdde0ea1b03c" -dependencies = [ - "bitflags 2.11.0", - "block2", - "libc", - "objc2", - "objc2-core-foundation", -] - [[package]] name = "objc2-io-kit" version = "0.3.2" @@ -4872,59 +4796,6 @@ dependencies = [ "objc2-core-foundation", ] -[[package]] -name = "objc2-io-surface" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7282e9ac92529fa3457ce90ebb15f4ecbc383e8338060960760fa2cf75420c3c" -dependencies = [ - "bitflags 2.11.0", - "objc2", - "objc2-core-foundation", -] - -[[package]] -name = "objc2-quartz-core" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "90ffb6a0cd5f182dc964334388560b12a57f7b74b3e2dec5e2722aa2dfb2ccd5" -dependencies = [ - "bitflags 2.11.0", - "objc2", - "objc2-core-foundation", - "objc2-foundation", -] - -[[package]] -name = "objc2-ui-kit" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "25b1312ad7bc8a0e92adae17aa10f90aae1fb618832f9b993b022b591027daed" -dependencies = [ - "bitflags 2.11.0", - "block2", - "objc2", - "objc2-cloud-kit", - "objc2-core-data", - "objc2-core-foundation", - "objc2-core-graphics", - "objc2-core-image", - "objc2-core-location", - "objc2-foundation", - "objc2-quartz-core", - "objc2-user-notifications", -] - -[[package]] -name = "objc2-user-notifications" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2a3f5ec77a81d9e0c5a0b32159b0cb143d7086165e79708351e02bf37dfc65cd" -dependencies = [ - "objc2", - "objc2-foundation", -] - [[package]] name = "object" version = "0.37.3" @@ -4957,7 +4828,7 @@ dependencies = [ "parking_lot", "percent-encoding", "quick-xml 0.39.2", - "rand 0.10.0", + "rand 0.10.1", "reqwest 0.12.28", "ring", "serde", @@ -5060,7 +4931,7 @@ dependencies = [ "futures-util", "opentelemetry", "percent-encoding", - "rand 0.9.2", + "rand 0.9.4", "thiserror 2.0.18", ] @@ -5073,21 +4944,6 @@ dependencies = [ "num-traits", ] -[[package]] -name = "os_info" -version = "3.14.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e4022a17595a00d6a369236fdae483f0de7f0a339960a53118b818238e132224" -dependencies = [ - "android_system_properties", - "log", - "nix 0.30.1", - "objc2", - "objc2-foundation", - "objc2-ui-kit", - "windows-sys 0.61.2", -] - [[package]] name = 
"owo-colors" version = "3.5.0" @@ -5165,7 +5021,7 @@ dependencies = [ "arrow-schema", "chrono", "half", - "indexmap 2.13.0", + "indexmap 2.14.0", "simdutf8", "uuid", ] @@ -5180,7 +5036,7 @@ dependencies = [ "arrow-schema", "chrono", "half", - "indexmap 2.13.0", + "indexmap 2.14.0", "parquet-variant", "parquet-variant-json", "serde_json", @@ -5219,7 +5075,7 @@ version = "0.1.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "575828d9d7d205188048eb1508560607a03d21eafdbba47b8cade1736c1c28e1" dependencies = [ - "bitflags 2.11.0", + "bitflags 2.11.1", "c-enum", "perf-event-open-sys2", ] @@ -5240,7 +5096,7 @@ version = "0.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0939b8fad77dfaeb29ebbd35faaeaadbf833167f30975f1b8993bbba09ea0a0f" dependencies = [ - "bitflags 2.11.0", + "bitflags 2.11.1", "c-enum", "libc", "memmap2", @@ -5299,7 +5155,7 @@ checksum = "8701b58ea97060d5e5b155d383a69952a60943f0e6dfe30b04c287beb0b27455" dependencies = [ "fixedbitset", "hashbrown 0.15.5", - "indexmap 2.13.0", + "indexmap 2.14.0", "serde", ] @@ -5347,17 +5203,11 @@ version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a89322df9ebe1c1578d689c92318e070967d1042b512afbe49518723f4e6d5cd" -[[package]] -name = "pin-utils" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" - [[package]] name = "pkg-config" -version = "0.3.32" +version = "0.3.33" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c" +checksum = "19f132c84eca552bf34cab8ec81f1c1dcc229b811638f9d283dceabe58c5569e" [[package]] name = "plain" @@ -5379,18 +5229,18 @@ checksum = "c33a9471896f1c69cecef8d20cbe2f7accd12527ce60845ff44c153bb2a21b49" [[package]] name = "portable-atomic-util" -version = "0.2.5" +version = "0.2.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a9db96d7fa8782dd8c15ce32ffe8680bbd1e978a43bf51a34d39483540495f5" +checksum = "c2a106d1259c23fac8e543272398ae0e3c0b8d33c88ed73d0cc71b0f1d902618" dependencies = [ "portable-atomic", ] [[package]] name = "potential_utf" -version = "0.1.4" +version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b73949432f5e2a09657003c25bca5e19a0e9c84f8058ca374f49e0ebe605af77" +checksum = "0103b1cef7ec0cf76490e969665504990193874ea05c85ff9bab8b911d0a0564" dependencies = [ "zerovec", ] @@ -5414,7 +5264,7 @@ dependencies = [ "inferno", "libc", "log", - "nix 0.26.4", + "nix", "once_cell", "smallvec", "spin 0.10.0", @@ -5434,9 +5284,9 @@ dependencies = [ [[package]] name = "pretty-hex" -version = "0.4.1" +version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bbc83ee4a840062f368f9096d80077a9841ec117e17e7f700df81958f1451254" +checksum = "9a65843dfefbafd3c879c683306959a6de478443ffe9c9adf02f5976432402d7" [[package]] name = "prettyplease" @@ -5540,9 +5390,9 @@ checksum = "33cb294fe86a74cbcf50d4445b37da762029549ebeea341421c7c70370f86cac" [[package]] name = "psm" -version = "0.1.30" +version = "0.1.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3852766467df634d74f0b2d7819bf8dc483a0eb2e3b0f50f756f9cfe8b0d18d8" +checksum = "645dbe486e346d9b5de3ef16ede18c26e6c70ad97418f4874b8b1889d6e761ea" dependencies = [ "ar_archive_writer", "cc", @@ -5588,7 +5438,7 @@ dependencies = [ 
"pin-project-lite", "quinn-proto", "quinn-udp", - "rustc-hash 2.1.1", + "rustc-hash 2.1.2", "rustls", "socket2", "thiserror 2.0.18", @@ -5606,9 +5456,9 @@ dependencies = [ "bytes", "getrandom 0.3.4", "lru-slab", - "rand 0.9.2", + "rand 0.9.4", "ring", - "rustc-hash 2.1.1", + "rustc-hash 2.1.2", "rustls", "rustls-pki-types", "slab", @@ -5661,9 +5511,9 @@ checksum = "dc33ff2d4973d518d823d61aa239014831e521c75da58e3df4840d3f47749d09" [[package]] name = "rand" -version = "0.8.5" +version = "0.8.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" +checksum = "5ca0ecfa931c29007047d1bc58e623ab12e5590e8c7cc53200d5202b69266d8a" dependencies = [ "libc", "rand_chacha 0.3.1", @@ -5672,9 +5522,9 @@ dependencies = [ [[package]] name = "rand" -version = "0.9.2" +version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6db2770f06117d490610c7488547d543617b21bfa07796d7a12f6f1bd53850d1" +checksum = "44c5af06bb1b7d3216d91932aed5265164bf384dc89cd6ba05cf59a35f5f76ea" dependencies = [ "rand_chacha 0.9.0", "rand_core 0.9.5", @@ -5682,13 +5532,13 @@ dependencies = [ [[package]] name = "rand" -version = "0.10.0" +version = "0.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc266eb313df6c5c09c1c7b1fbe2510961e5bcd3add930c1e31f7ed9da0feff8" +checksum = "d2e8e8bcc7961af1fdac401278c6a831614941f6164ee3bf4ce61b7edb162207" dependencies = [ "chacha20", "getrandom 0.4.2", - "rand_core 0.10.0", + "rand_core 0.10.1", ] [[package]] @@ -5731,9 +5581,9 @@ dependencies = [ [[package]] name = "rand_core" -version = "0.10.0" +version = "0.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c8d0fd677905edcbeedbf2edb6494d676f0e98d54d5cf9bda0b061cb8fb8aba" +checksum = "63b8176103e19a2643978565ca18b50549f6101881c443590420e4dc998a3c69" [[package]] name = "rand_pcg" @@ -5776,7 +5626,7 @@ version = "0.5.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ed2bf2547551a7053d6fdfafda3f938979645c44812fbfcda098faae3f1a362d" dependencies = [ - "bitflags 2.11.0", + "bitflags 2.11.1", ] [[package]] @@ -5863,9 +5713,9 @@ dependencies = [ [[package]] name = "reqwest" -version = "0.13.2" +version = "0.13.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ab3f43e3283ab1488b624b44b0e988d0acea0b3214e694730a055cb6b2efa801" +checksum = "62e0021ea2c22aed41653bc7e1419abb2c97e038ff2c33d0e1309e49a97deec0" dependencies = [ "base64 0.22.1", "bytes", @@ -5917,9 +5767,9 @@ dependencies = [ [[package]] name = "rtrb" -version = "0.3.3" +version = "0.3.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7204ed6420f698836b76d4d5c2ec5dec7585fd5c3a788fd1cde855d1de598239" +checksum = "4ade083ccbb4bf536df69d1f6432cc23deb7acccff86b183f3923a6fd56a1153" [[package]] name = "rustc-demangle" @@ -5935,9 +5785,9 @@ checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" [[package]] name = "rustc-hash" -version = "2.1.1" +version = "2.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "357703d41365b4b27c590e3ed91eabb1b663f07c4c084095e60cbed4362dff0d" +checksum = "94300abf3f1ae2e2b8ffb7b58043de3d399c73fa6f4b73826402a5c457614dbe" [[package]] name = "rustc_version" @@ -5954,7 +5804,7 @@ version = "1.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b6fe4565b9518b83ef4f91bb47ce29620ca828bd32cb7e408f0062e9930ba190" 
dependencies = [ - "bitflags 2.11.0", + "bitflags 2.11.1", "errno", "libc", "linux-raw-sys", @@ -5963,9 +5813,9 @@ dependencies = [ [[package]] name = "rustls" -version = "0.23.37" +version = "0.23.40" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "758025cb5fccfd3bc2fd74708fd4682be41d99e5dff73c377c0646c6012c73a4" +checksum = "ef86cd5876211988985292b91c96a8f2d298df24e75989a43a3c73f2d4d8168b" dependencies = [ "once_cell", "ring", @@ -5989,9 +5839,9 @@ dependencies = [ [[package]] name = "rustls-pki-types" -version = "1.14.0" +version = "1.14.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "be040f8b0a225e40375822a563fa9524378b9d63112f53e19ffff34df5d33fdd" +checksum = "30a7197ae7eb376e574fe940d068c30fe0462554a3ddbe4eca7838e049c937a9" dependencies = [ "web-time", "zeroize", @@ -5999,9 +5849,9 @@ dependencies = [ [[package]] name = "rustls-webpki" -version = "0.103.10" +version = "0.103.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "df33b2b81ac578cabaf06b89b0631153a3f416b0a886e8a7a1707fb51abbd1ef" +checksum = "61c429a8649f110dddef65e2a5ad240f747e85f7758a6bccc7e5777bd33f756e" dependencies = [ "ring", "rustls-pki-types", @@ -6031,9 +5881,9 @@ dependencies = [ [[package]] name = "schannel" -version = "0.1.28" +version = "0.1.29" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "891d81b926048e76efe18581bf793546b4c0eaf8448d72be8de2bbee5fd166e1" +checksum = "91c1b7e4904c873ef0710c1f407dde2e6287de2bebc1bbbf7d430bb7cbffd939" dependencies = [ "windows-sys 0.61.2", ] @@ -6076,7 +5926,7 @@ version = "3.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b7f4bc775c73d9a02cde8bf7b2ec4c9d12743edf609006c7facc23998404cd1d" dependencies = [ - "bitflags 2.11.0", + "bitflags 2.11.1", "core-foundation 0.10.1", "core-foundation-sys", "libc", @@ -6095,9 +5945,9 @@ dependencies = [ [[package]] name = "semver" -version = "1.0.27" +version = "1.0.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d767eb0aabc880b29956c35734170f26ed551a859dbd361d140cdbeca61ab1e2" +checksum = "8a7852d02fc848982e0c167ef163aaff9cd91dc640ba85e263cb1ce46fae51cd" [[package]] name = "send_wrapper" @@ -6282,17 +6132,18 @@ checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" [[package]] name = "shuttle" -version = "0.8.1" +version = "0.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2ab17edba38d63047f46780cf7360acf7467fec2c048928689a5c1dd1c2b4e31" +checksum = "ba93071c1b720be2505f4c8ce2863502cb9a26a3819e268df1932458a755152c" dependencies = [ "assoc", "bitvec", "cfg-if", - "generator", + "const-siphasher", + "corosensei", "hex", "owo-colors", - "rand 0.8.5", + "rand 0.8.6", "rand_core 0.6.4", "rand_pcg", "scoped-tls", @@ -6302,9 +6153,9 @@ dependencies = [ [[package]] name = "simd-adler32" -version = "0.3.8" +version = "0.3.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e320a6c5ad31d271ad523dcf3ad13e2767ad8b1cb8f047f75a8aeaf8da139da2" +checksum = "703d5c7ef118737c72f1af64ad2f6f8c5e1921f818cdcb97b8fe6fc69bf66214" [[package]] name = "simdutf8" @@ -6442,15 +6293,15 @@ checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596" [[package]] name = "stacker" -version = "0.1.23" +version = "0.1.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08d74a23609d509411d10e2176dc2a4346e3b4aea2e7b1869f19fdedbc71c013" +checksum = 
"640c8cdd92b6b12f5bcb1803ca3bbf5ab96e5e6b6b96b9ab77dabe9e880b3190" dependencies = [ "cc", "cfg-if", "libc", "psm", - "windows-sys 0.59.0", + "windows-sys 0.61.2", ] [[package]] @@ -6467,9 +6318,9 @@ checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" [[package]] name = "subsecond" -version = "0.7.4" +version = "0.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5dbb9f2928b6654ccc28d4ddfef5213e97ed66afed4907774d049b376c62a838" +checksum = "feae81a4a7ca6d0bcf70c385a43b7dbacbff527f0805cb0a4043ce2c2c559a2c" dependencies = [ "js-sys", "libc", @@ -6486,9 +6337,9 @@ dependencies = [ [[package]] name = "subsecond-types" -version = "0.7.4" +version = "0.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "388bb28e6ddbee717745963b8932d9a6e24a5d3c93350655f733e938de04d81f" +checksum = "85256ee192cbdf00473e48e6133863b125dd4f772fddfbc97287ec7a61458c25" dependencies = [ "serde", ] @@ -6501,15 +6352,15 @@ checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" [[package]] name = "sval" -version = "2.17.0" +version = "2.18.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c1aaf178a50bbdd86043fce9bf0a5867007d9b382db89d1c96ccae4601ff1ff9" +checksum = "2eb9318255ebd817902d7e279d8f8e39b35b1b9954decd5eb9ea0e30e5fd2b6a" [[package]] name = "sval_buffer" -version = "2.17.0" +version = "2.18.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f89273e48f03807ebf51c4d81c52f28d35ffa18a593edf97e041b52de143df89" +checksum = "12571299185e653fdb0fbfe36cd7f6529d39d4e747a60b15a3f34574b7b97c61" dependencies = [ "sval", "sval_ref", @@ -6517,18 +6368,18 @@ dependencies = [ [[package]] name = "sval_dynamic" -version = "2.17.0" +version = "2.18.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0430f4e18e7eba21a49d10d25a8dec3ce0e044af40b162347e99a8e3c3ced864" +checksum = "39526f24e997706c0de7f03fb7371f7f5638b66a504ded508e20ad173d0a3677" dependencies = [ "sval", ] [[package]] name = "sval_fmt" -version = "2.17.0" +version = "2.18.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "835f51b9d7331b9d7fc48fc716c02306fa88c4a076b1573531910c91a525882d" +checksum = "933dd3bb26965d682280fcc49400ac2a05036f4ee1e6dbd61bf8402d5a5c3a54" dependencies = [ "itoa", "ryu", @@ -6537,9 +6388,9 @@ dependencies = [ [[package]] name = "sval_json" -version = "2.17.0" +version = "2.18.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "13cbfe3ef406ee2366e7e8ab3678426362085fa9eaedf28cb878a967159dced3" +checksum = "a0cda08f6d5c9948024a6551077557b1fdcc3880ff2f20ae839667d2ec2d87ed" dependencies = [ "itoa", "ryu", @@ -6548,9 +6399,9 @@ dependencies = [ [[package]] name = "sval_nested" -version = "2.17.0" +version = "2.18.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b20358af4af787c34321a86618c3cae12eabdd0e9df22cd9dd2c6834214c518" +checksum = "88d49d5e6c1f9fd0e53515819b03a97ca4eb1bff5c8ee097c43391c09ecfb19f" dependencies = [ "sval", "sval_buffer", @@ -6559,18 +6410,18 @@ dependencies = [ [[package]] name = "sval_ref" -version = "2.17.0" +version = "2.18.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fb5e500f8eb2efa84f75e7090f7fc43f621b9f8b6cde571c635b3855f97b332a" +checksum = "14f876c5a78405375b4e19cbb9554407513b59c93dea12dc6a4af4e1d30899ca" dependencies = [ "sval", ] [[package]] name = "sval_serde" -version = "2.17.0" +version = 
"2.18.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ca2032ae39b11dcc6c18d5fbc50a661ea191cac96484c59ccf49b002261ca2c1" +checksum = "5f9ccd3b7f7200239a655e517dd3fd48d960b9111ad24bd6a5e055bef17607c7" dependencies = [ "serde_core", "sval", @@ -6579,9 +6430,9 @@ dependencies = [ [[package]] name = "symbolic-common" -version = "12.17.2" +version = "12.18.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "751a2823d606b5d0a7616499e4130a516ebd01a44f39811be2b9600936509c23" +checksum = "332615d90111d8eeaf86a84dc9bbe9f65d0d8c5cf11b4caccedc37754eb0dcfd" dependencies = [ "debugid", "memmap2", @@ -6591,9 +6442,9 @@ dependencies = [ [[package]] name = "symbolic-demangle" -version = "12.17.2" +version = "12.18.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "79b237cfbe320601dd24b4ac817a5b68bb28f5508e33f08d42be0682cadc8ac9" +checksum = "912017718eb4d21930546245af9a3475c9dccf15675a5c215664e76621afc471" dependencies = [ "cpp_demangle", "rustc-demangle", @@ -6651,7 +6502,7 @@ version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a13f3d0daba03132c0aa9767f98351b3488edc2c100cda2d2ec2b04f3d8d3c8b" dependencies = [ - "bitflags 2.11.0", + "bitflags 2.11.1", "core-foundation 0.9.4", "system-configuration-sys", ] @@ -6668,21 +6519,22 @@ dependencies = [ [[package]] name = "t4" -version = "0.1.3" +version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2586231b4196aa1217287f36db73b82afaf2e428d82824b395a806e0e7ae2b76" +checksum = "dea97b3f25826306b821d1c0b070709ef61d28e98c3ef1153de1048c33acf381" dependencies = [ "io-uring", "libc", "shuttle", "t4-verified", + "vstd", ] [[package]] name = "t4-verified" -version = "0.1.3" +version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f27bc679002d08d84ba2ee9e644ee2da132db2540e454b7a1f5b024e3e668089" +checksum = "a729c7595d4771ff781dcd1fdd5f65d14c8225dce6341ddf28792b292e6f5cdb" dependencies = [ "vstd", ] @@ -6708,12 +6560,12 @@ dependencies = [ [[package]] name = "terminal_size" -version = "0.4.3" +version = "0.4.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "60b8cb979cb11c32ce1603f8137b22262a9d131aaa5c37b5678025f22b8becd0" +checksum = "230a1b821ccbd75b185820a1f1ff7b14d21da1e442e22c0863ea5f08771a8874" dependencies = [ "rustix", - "windows-sys 0.60.2", + "windows-sys 0.59.0", ] [[package]] @@ -6828,9 +6680,9 @@ dependencies = [ [[package]] name = "tinystr" -version = "0.8.2" +version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "42d3e9c45c09de15d06dd8acf5f4e0e399e85927b7f00711024eb7ae10fa4869" +checksum = "c8323304221c2a851516f22236c5722a72eaa19749016521d6dff0824447d96d" dependencies = [ "displaydoc", "zerovec", @@ -6853,9 +6705,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.50.0" +version = "1.52.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "27ad5e34374e03cfffefc301becb44e9dc3c17584f414349ebe29ed26661822d" +checksum = "b67dee974fe86fd92cc45b7a95fdd2f99a36a6d7b0d431a231178d3d670bbcc6" dependencies = [ "bytes", "libc", @@ -6868,9 +6720,9 @@ dependencies = [ [[package]] name = "tokio-macros" -version = "2.6.1" +version = "2.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c55a2eff8b69ce66c84f85e1da1c233edc36ceb85a2058d11b0d6a3c7e7569c" +checksum = 
"385a6cb71ab9ab790c5fe8d67f1645e6c450a7ce006a33de03daa956cf70a496" dependencies = [ "proc-macro2", "quote", @@ -6922,6 +6774,18 @@ dependencies = [ "tungstenite 0.28.0", ] +[[package]] +name = "tokio-tungstenite" +version = "0.29.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f72a05e828585856dacd553fba484c242c46e391fb0e58917c942ee9202915c" +dependencies = [ + "futures-util", + "log", + "tokio", + "tungstenite 0.29.0", +] + [[package]] name = "tokio-util" version = "0.7.18" @@ -6939,32 +6803,32 @@ dependencies = [ [[package]] name = "toml_datetime" -version = "1.0.1+spec-1.1.0" +version = "1.1.1+spec-1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b320e741db58cac564e26c607d3cc1fdc4a88fd36c879568c07856ed83ff3e9" +checksum = "3165f65f62e28e0115a00b2ebdd37eb6f3b641855f9d636d3cd4103767159ad7" dependencies = [ "serde_core", ] [[package]] name = "toml_edit" -version = "0.25.5+spec-1.1.0" +version = "0.25.11+spec-1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8ca1a40644a28bce036923f6a431df0b34236949d111cc07cb6dca830c9ef2e1" +checksum = "0b59c4d22ed448339746c59b905d24568fcbb3ab65a500494f7b8c3e97739f2b" dependencies = [ - "indexmap 2.13.0", + "indexmap 2.14.0", "toml_datetime", "toml_parser", - "winnow 1.0.0", + "winnow 1.0.2", ] [[package]] name = "toml_parser" -version = "1.0.10+spec-1.1.0" +version = "1.1.2+spec-1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7df25b4befd31c4816df190124375d5a20c6b6921e2cad937316de3fccd63420" +checksum = "a2abe9b86193656635d2411dc43050282ca48aa31c2451210f4202550afb7526" dependencies = [ - "winnow 1.0.0", + "winnow 1.0.2", ] [[package]] @@ -7015,7 +6879,7 @@ checksum = "ebe5ef63511595f1344e2d5cfa636d973292adc0eec1f0ad45fae9f0851ab1d4" dependencies = [ "futures-core", "futures-util", - "indexmap 2.13.0", + "indexmap 2.14.0", "pin-project-lite", "slab", "sync_wrapper", @@ -7032,7 +6896,7 @@ version = "0.6.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d4e6559d53cc268e5031cd8429d05415bc4cb4aefc4aa5d6cc35fbf5b924a1f8" dependencies = [ - "bitflags 2.11.0", + "bitflags 2.11.1", "bytes", "futures-core", "futures-util", @@ -7166,7 +7030,7 @@ dependencies = [ "http", "httparse", "log", - "rand 0.9.2", + "rand 0.9.4", "sha1", "thiserror 2.0.18", "utf-8", @@ -7183,12 +7047,28 @@ dependencies = [ "http", "httparse", "log", - "rand 0.9.2", + "rand 0.9.4", "sha1", "thiserror 2.0.18", "utf-8", ] +[[package]] +name = "tungstenite" +version = "0.29.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c01152af293afb9c7c2a57e4b559c5620b421f6d133261c60dd2d0cdb38e6b8" +dependencies = [ + "bytes", + "data-encoding", + "http", + "httparse", + "log", + "rand 0.9.4", + "sha1", + "thiserror 2.0.18", +] + [[package]] name = "twox-hash" version = "2.1.2" @@ -7203,9 +7083,9 @@ checksum = "bc7d623258602320d5c55d1bc22793b57daff0ec7efc270ea7d55ce1d5f5471c" [[package]] name = "typenum" -version = "1.19.0" +version = "1.20.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "562d481066bde0658276a35467c4af00bdc6ee726305698a55b86e61d7ad82bb" +checksum = "40ce102ab67701b8526c123c1bab5cbe42d7040ccfd0f64af1a385808d2f43de" [[package]] name = "ucd-trie" @@ -7227,9 +7107,9 @@ checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75" [[package]] name = "unicode-segmentation" -version = "1.12.0" +version = "1.13.2" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "f6ccf251212114b54433ec949fd6a7841275f9ada20dddd2f29e9ceea4501493" +checksum = "9629274872b2bfaf8d66f5f15725007f635594914870f65218920345aa11aa8c" [[package]] name = "unicode-width" @@ -7343,9 +7223,9 @@ checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" [[package]] name = "uuid" -version = "1.23.0" +version = "1.23.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ac8b6f42ead25368cf5b098aeb3dc8a1a2c05a3eee8a9a1a68c640edbfc79d9" +checksum = "ddd74a9687298c6858e9b88ec8935ec45d22e8fd5e6394fa1bd4e99a87789c76" dependencies = [ "getrandom 0.4.2", "js-sys", @@ -7403,15 +7283,15 @@ checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" [[package]] name = "verus_builtin" -version = "0.0.0-2026-02-08-0120" +version = "0.0.0-2026-04-12-0118" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c1bd13abb198fff161cf4be16cf8c1723f09941d03e323af71146fa4cfb0de0f" +checksum = "a46cb431066009ad2035f6bca936b1c2b7e293bffec93a2090fead0f35ab4276" [[package]] name = "verus_builtin_macros" -version = "0.0.0-2026-02-22-0103" +version = "0.0.0-2026-04-20-1748" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fe750a656777699bfdcfee3894cc24f04b6f276e1240b2c9dbdeccdae4e5d94e" +checksum = "3131a0e0b0dc9272cb574ad1f8506d8e25ba33c9f9972151866c34cad12bb076" dependencies = [ "proc-macro2", "quote", @@ -7423,9 +7303,9 @@ dependencies = [ [[package]] name = "verus_prettyplease" -version = "0.0.0-2026-02-15-0106" +version = "0.0.0-2026-04-12-0118" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aa1c9a8d5635aac38528868312f72ed3f0e76f58ca25ac5c407d1064bdc33430" +checksum = "4b246e61b068e807cb05a030fc1d7efa83d2a0d227eaf67921661419edfe8e83" dependencies = [ "proc-macro2", "verus_syn", @@ -7433,9 +7313,9 @@ dependencies = [ [[package]] name = "verus_state_machines_macros" -version = "0.0.0-2026-02-15-0106" +version = "0.0.0-2026-04-20-1748" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e28cb0ec7d66fc27adccbd5aa9d96acf3ab5508cd97f1ca8930fed4f272f6ba8" +checksum = "ba2497778c0b23d5cbd93f43cce512438af5cb01b1f1435aa7e411da8495efb5" dependencies = [ "indexmap 1.9.3", "proc-macro2", @@ -7445,9 +7325,9 @@ dependencies = [ [[package]] name = "verus_syn" -version = "0.0.0-2026-02-15-0106" +version = "0.0.0-2026-04-05-0114" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6fbe9986fe3ffe05a07a87834a6a2400792e52969813fa9660e313e1c9eda6b7" +checksum = "285b554a87b470ee705634ea1cdc92c14ba088a7b8bdacbb9116b32e832fe272" dependencies = [ "proc-macro2", "quote", @@ -7456,9 +7336,9 @@ dependencies = [ [[package]] name = "vstd" -version = "0.0.0-2026-02-22-0103" +version = "0.0.0-2026-04-20-1748" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6b74abf6d04d21b8898894960d11dbe2489ab48be1f24eca92821b865616910c" +checksum = "8d03a77a2ee1b91bc776f8e9ee716d83b519d4338912116d7e855edfa3a8b976" dependencies = [ "verus_builtin", "verus_builtin_macros", @@ -7514,11 +7394,11 @@ checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b" [[package]] name = "wasip2" -version = "1.0.2+wasi-0.2.9" +version = "1.0.3+wasi-0.2.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9517f9239f02c069db75e65f174b3da828fe5f5b945c4dd26bd25d89c03ebcf5" +checksum = 
"20064672db26d7cdc89c7798c48a0fdfac8213434a1186e5ef29fd560ae223d6" dependencies = [ - "wit-bindgen", + "wit-bindgen 0.57.1", ] [[package]] @@ -7527,14 +7407,14 @@ version = "0.4.0+wasi-0.3.0-rc-2026-01-06" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5428f8bf88ea5ddc08faddef2ac4a67e390b88186c703ce6dbd955e1c145aca5" dependencies = [ - "wit-bindgen", + "wit-bindgen 0.51.0", ] [[package]] name = "wasm-bindgen" -version = "0.2.114" +version = "0.2.118" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6532f9a5c1ece3798cb1c2cfdba640b9b3ba884f5db45973a6f442510a87d38e" +checksum = "0bf938a0bacb0469e83c1e148908bd7d5a6010354cf4fb73279b7447422e3a89" dependencies = [ "cfg-if", "once_cell", @@ -7545,23 +7425,19 @@ dependencies = [ [[package]] name = "wasm-bindgen-futures" -version = "0.4.64" +version = "0.4.68" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e9c5522b3a28661442748e09d40924dfb9ca614b21c00d3fd135720e48b67db8" +checksum = "f371d383f2fb139252e0bfac3b81b265689bf45b6874af544ffa4c975ac1ebf8" dependencies = [ - "cfg-if", - "futures-util", "js-sys", - "once_cell", "wasm-bindgen", - "web-sys", ] [[package]] name = "wasm-bindgen-macro" -version = "0.2.114" +version = "0.2.118" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "18a2d50fcf105fb33bb15f00e7a77b772945a2ee45dcf454961fd843e74c18e6" +checksum = "eeff24f84126c0ec2db7a449f0c2ec963c6a49efe0698c4242929da037ca28ed" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -7569,9 +7445,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.114" +version = "0.2.118" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "03ce4caeaac547cdf713d280eda22a730824dd11e6b8c3ca9e42247b25c631e3" +checksum = "9d08065faf983b2b80a79fd87d8254c409281cf7de75fc4b773019824196c904" dependencies = [ "bumpalo", "proc-macro2", @@ -7582,9 +7458,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-shared" -version = "0.2.114" +version = "0.2.118" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "75a326b8c223ee17883a4251907455a2431acc2791c98c26279376490c378c16" +checksum = "5fd04d9e306f1907bd13c6361b5c6bfc7b3b3c095ed3f8a9246390f8dbdee129" dependencies = [ "unicode-ident", ] @@ -7606,7 +7482,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bb0e353e6a2fbdc176932bbaab493762eb1255a7900fe0fea1a2f96c296cc909" dependencies = [ "anyhow", - "indexmap 2.13.0", + "indexmap 2.14.0", "wasm-encoder", "wasmparser", ] @@ -7630,17 +7506,17 @@ version = "0.244.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "47b807c72e1bac69382b3a6fb3dbe8ea4c0ed87ff5629b8685ae6b9a611028fe" dependencies = [ - "bitflags 2.11.0", + "bitflags 2.11.1", "hashbrown 0.15.5", - "indexmap 2.13.0", + "indexmap 2.14.0", "semver", ] [[package]] name = "web-sys" -version = "0.3.91" +version = "0.3.95" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "854ba17bb104abfb26ba36da9729addc7ce7f06f5c0f90f3c391f8461cca21f9" +checksum = "4f2dfbb17949fa2088e5d39408c48368947b86f7834484e87b73de55bc14d97d" dependencies = [ "js-sys", "wasm-bindgen", @@ -7658,9 +7534,9 @@ dependencies = [ [[package]] name = "webpki-roots" -version = "1.0.6" +version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "22cfaf3c063993ff62e73cb4311efde4db1efb31ab78a3e5c457939ad5cc0bed" +checksum = 
"52f5ee44c96cf55f1b349600768e3ece3a8f26010c05265ab73f945bb1a2eb9d" dependencies = [ "rustls-pki-types", ] @@ -7835,15 +7711,6 @@ dependencies = [ "windows-targets 0.52.6", ] -[[package]] -name = "windows-sys" -version = "0.60.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2f500e4d28234f72040990ec9d39e3a6b950f9f22d3dba18416c35882612bcb" -dependencies = [ - "windows-targets 0.53.5", -] - [[package]] name = "windows-sys" version = "0.61.2" @@ -7877,30 +7744,13 @@ dependencies = [ "windows_aarch64_gnullvm 0.52.6", "windows_aarch64_msvc 0.52.6", "windows_i686_gnu 0.52.6", - "windows_i686_gnullvm 0.52.6", + "windows_i686_gnullvm", "windows_i686_msvc 0.52.6", "windows_x86_64_gnu 0.52.6", "windows_x86_64_gnullvm 0.52.6", "windows_x86_64_msvc 0.52.6", ] -[[package]] -name = "windows-targets" -version = "0.53.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4945f9f551b88e0d65f3db0bc25c33b8acea4d9e41163edf90dcd0b19f9069f3" -dependencies = [ - "windows-link", - "windows_aarch64_gnullvm 0.53.1", - "windows_aarch64_msvc 0.53.1", - "windows_i686_gnu 0.53.1", - "windows_i686_gnullvm 0.53.1", - "windows_i686_msvc 0.53.1", - "windows_x86_64_gnu 0.53.1", - "windows_x86_64_gnullvm 0.53.1", - "windows_x86_64_msvc 0.53.1", -] - [[package]] name = "windows-threading" version = "0.2.1" @@ -7922,12 +7772,6 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" -[[package]] -name = "windows_aarch64_gnullvm" -version = "0.53.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a9d8416fa8b42f5c947f8482c43e7d89e73a173cead56d044f6a56104a6d1b53" - [[package]] name = "windows_aarch64_msvc" version = "0.42.2" @@ -7940,12 +7784,6 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" -[[package]] -name = "windows_aarch64_msvc" -version = "0.53.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b9d782e804c2f632e395708e99a94275910eb9100b2114651e04744e9b125006" - [[package]] name = "windows_i686_gnu" version = "0.42.2" @@ -7958,24 +7796,12 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" -[[package]] -name = "windows_i686_gnu" -version = "0.53.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "960e6da069d81e09becb0ca57a65220ddff016ff2d6af6a223cf372a506593a3" - [[package]] name = "windows_i686_gnullvm" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" -[[package]] -name = "windows_i686_gnullvm" -version = "0.53.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fa7359d10048f68ab8b09fa71c3daccfb0e9b559aed648a8f95469c27057180c" - [[package]] name = "windows_i686_msvc" version = "0.42.2" @@ -7988,12 +7814,6 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" -[[package]] -name = "windows_i686_msvc" -version = "0.53.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e7ac75179f18232fe9c285163565a57ef8d3c89254a30685b57d83a38d326c2" - [[package]] name = 
"windows_x86_64_gnu" version = "0.42.2" @@ -8006,12 +7826,6 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" -[[package]] -name = "windows_x86_64_gnu" -version = "0.53.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c3842cdd74a865a8066ab39c8a7a473c0778a3f29370b5fd6b4b9aa7df4a499" - [[package]] name = "windows_x86_64_gnullvm" version = "0.42.2" @@ -8024,12 +7838,6 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" -[[package]] -name = "windows_x86_64_gnullvm" -version = "0.53.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ffa179e2d07eee8ad8f57493436566c7cc30ac536a3379fdf008f47f6bb7ae1" - [[package]] name = "windows_x86_64_msvc" version = "0.42.2" @@ -8042,12 +7850,6 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" -[[package]] -name = "windows_x86_64_msvc" -version = "0.53.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d6bbff5f0aada427a1e5a6da5f1f98158182f26556f345ac9e04d36d0ebed650" - [[package]] name = "winnow" version = "0.7.15" @@ -8059,9 +7861,9 @@ dependencies = [ [[package]] name = "winnow" -version = "1.0.0" +version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a90e88e4667264a994d34e6d1ab2d26d398dcdca8b7f52bec8668957517fc7d8" +checksum = "2ee1708bef14716a11bae175f579062d4554d95be2c6829f518df847b7b3fdd0" dependencies = [ "memchr", ] @@ -8075,6 +7877,12 @@ dependencies = [ "wit-bindgen-rust-macro", ] +[[package]] +name = "wit-bindgen" +version = "0.57.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ebf944e87a7c253233ad6766e082e3cd714b5d03812acc24c318f549614536e" + [[package]] name = "wit-bindgen-core" version = "0.51.0" @@ -8094,7 +7902,7 @@ checksum = "b7c566e0f4b284dd6561c786d9cb0142da491f46a9fbed79ea69cdad5db17f21" dependencies = [ "anyhow", "heck", - "indexmap 2.13.0", + "indexmap 2.14.0", "prettyplease", "syn", "wasm-metadata", @@ -8124,8 +7932,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9d66ea20e9553b30172b5e831994e35fbde2d165325bec84fc43dbf6f4eb9cb2" dependencies = [ "anyhow", - "bitflags 2.11.0", - "indexmap 2.13.0", + "bitflags 2.11.1", + "indexmap 2.14.0", "log", "serde", "serde_derive", @@ -8144,7 +7952,7 @@ checksum = "ecc8ac4bc1dc3381b7f59c34f00b67e18f910c2c0f50015669dde7def656a736" dependencies = [ "anyhow", "id-arena", - "indexmap 2.13.0", + "indexmap 2.14.0", "log", "semver", "serde", @@ -8156,9 +7964,9 @@ dependencies = [ [[package]] name = "writeable" -version = "0.6.2" +version = "0.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9edde0db4769d2dc68579893f2306b26c6ecfbe0ef499b013d731b7b9247e0b9" +checksum = "1ffae5123b2d3fc086436f8834ae3ab053a283cfac8fe0a0b8eaae044768a4c4" [[package]] name = "wyz" @@ -8177,9 +7985,9 @@ checksum = "fdd20c5420375476fbd4394763288da7eb0cc0b8c11deed431a91562af7335d3" [[package]] name = "yoke" -version = "0.8.1" +version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72d6e5c6afb84d73944e5cedb052c4680d5657337201555f9f2a16b7406d4954" +checksum = 
"abe8c5fda708d9ca3df187cae8bfb9ceda00dd96231bed36e445a1a48e66f9ca" dependencies = [ "stable_deref_trait", "yoke-derive", @@ -8188,9 +7996,9 @@ dependencies = [ [[package]] name = "yoke-derive" -version = "0.8.1" +version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b659052874eb698efe5b9e8cf382204678a0086ebf46982b79d6ca3182927e5d" +checksum = "de844c262c8848816172cef550288e7dc6c7b7814b4ee56b3e1553f275f1858e" dependencies = [ "proc-macro2", "quote", @@ -8200,18 +8008,18 @@ dependencies = [ [[package]] name = "zerocopy" -version = "0.8.47" +version = "0.8.48" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "efbb2a062be311f2ba113ce66f697a4dc589f85e78a4aea276200804cea0ed87" +checksum = "eed437bf9d6692032087e337407a86f04cd8d6a16a37199ed57949d415bd68e9" dependencies = [ "zerocopy-derive", ] [[package]] name = "zerocopy-derive" -version = "0.8.47" +version = "0.8.48" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0e8bc7269b54418e7aeeef514aa68f8690b8c0489a06b0136e5f57c4c5ccab89" +checksum = "70e3cd084b1788766f53af483dd21f93881ff30d7320490ec3ef7526d203bad4" dependencies = [ "proc-macro2", "quote", @@ -8220,18 +8028,18 @@ dependencies = [ [[package]] name = "zerofrom" -version = "0.1.6" +version = "0.1.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "50cc42e0333e05660c3587f3bf9d0478688e15d870fab3346451ce7f8c9fbea5" +checksum = "69faa1f2a1ea75661980b013019ed6687ed0e83d069bc1114e2cc74c6c04c4df" dependencies = [ "zerofrom-derive", ] [[package]] name = "zerofrom-derive" -version = "0.1.6" +version = "0.1.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d71e5d6e06ab090c67b5e44993ec16b72dcbaabc526db883a360057678b48502" +checksum = "11532158c46691caf0f2593ea8358fed6bbf68a0315e80aae9bd41fbade684a1" dependencies = [ "proc-macro2", "quote", @@ -8247,9 +8055,9 @@ checksum = "b97154e67e32c85465826e8bcc1c59429aaaf107c1e4a9e53c8d8ccd5eff88d0" [[package]] name = "zerotrie" -version = "0.2.3" +version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2a59c17a5562d507e4b54960e8569ebee33bee890c70aa3fe7b97e85a9fd7851" +checksum = "0f9152d31db0792fa83f70fb2f83148effb5c1f5b8c7686c3459e361d9bc20bf" dependencies = [ "displaydoc", "yoke", @@ -8258,9 +8066,9 @@ dependencies = [ [[package]] name = "zerovec" -version = "0.11.5" +version = "0.11.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c28719294829477f525be0186d13efa9a3c602f7ec202ca9e353d310fb9a002" +checksum = "90f911cbc359ab6af17377d242225f4d75119aec87ea711a880987b18cd7b239" dependencies = [ "yoke", "zerofrom", @@ -8269,9 +8077,9 @@ dependencies = [ [[package]] name = "zerovec-derive" -version = "0.11.2" +version = "0.11.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eadce39539ca5cb3985590102671f2567e659fca9666581ad3411d59207951f3" +checksum = "625dc425cab0dca6dc3c3319506e6593dcb08a9f387ea3b284dbd52a92c40555" dependencies = [ "proc-macro2", "quote", diff --git a/Cargo.toml b/Cargo.toml index f5aea123..a6a31c64 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -43,15 +43,15 @@ parquet = { version = "58.1.0", features = [ ] } parquet-variant-json = { version = "58.1.0" } parquet-variant-compute = { version = "58.1.0" } -datafusion = { version = "53.0.0" } -datafusion-common = { version = "53.0.0" } -datafusion-expr-common = { version = "53.0.0" } -datafusion-physical-expr = { version = "53.0.0" } 
-datafusion-physical-expr-common = { version = "53.0.0" } -datafusion-proto = { version = "53.0.0" } +datafusion = { version = "53.1.0" } +datafusion-common = { version = "53.1.0" } +datafusion-expr-common = { version = "53.1.0" } +datafusion-physical-expr = { version = "53.1.0" } +datafusion-physical-expr-common = { version = "53.1.0" } +datafusion-proto = { version = "53.1.0" } async-trait = "0.1.89" futures = { version = "0.3.32", default-features = false, features = ["std"] } -tokio = { version = "1.50.0", features = ["rt-multi-thread"] } +tokio = { version = "1.52.1", features = ["rt-multi-thread"] } log = "0.4.29" tonic = { version = "0.14.5" } url = "2.5.8" @@ -62,12 +62,12 @@ object_store = { version = "0.13.2", default-features = false } serde = { version = "1.0", default-features = false, features = ["derive"] } serde_json = { version = "1.0", default-features = false, features = ["std"] } tempfile = "3.27.0" -uuid = { version = "1.23.0", features = ["v4"] } +uuid = { version = "1.23.1", features = ["v4"] } fastrace = "0.7" fastrace-tonic = "0.2" congee = "0.4.1" insta = "1.47.2" -t4 = "0.1.3" +t4 = "0.1.6" [profile.dev.package] insta.opt-level = 3 diff --git a/benchmark/Cargo.toml b/benchmark/Cargo.toml index 6180a248..a95464fd 100644 --- a/benchmark/Cargo.toml +++ b/benchmark/Cargo.toml @@ -16,9 +16,9 @@ tokio = { workspace = true } log = { workspace = true } arrow-flight = { workspace = true } tonic = { workspace = true } -clap = { version = "4.6.0", features = ["derive"] } +clap = { version = "4.6.1", features = ["derive"] } url = { workspace = true } -mimalloc = "0.1.48" +mimalloc = "0.1.50" serde_json.workspace = true serde.workspace = true sysinfo = { version = "0.38.4", default-features = false, features = [ @@ -35,8 +35,8 @@ opentelemetry = "0.31.0" opentelemetry_sdk = "0.31.0" opentelemetry-otlp = { version = "0.31.1", features = ["trace", "grpc-tonic"] } logforth = { version = "0.29.1", features = ["append-opentelemetry", "bridge-log"] } -reqwest = { version = "0.13.2", default-features = false, features = ["json"] } -uuid = { version = "1.23.0", features = ["v4"] } +reqwest = { version = "0.13.3", default-features = false, features = ["json"] } +uuid = { version = "1.23.1", features = ["v4"] } pprof = { version = "0.15.0", features = ["flamegraph"] } anyhow = "1.0" usdt = "0.6" diff --git a/benchmark/README.md b/benchmark/README.md index 5225d0b2..056ec543 100644 --- a/benchmark/README.md +++ b/benchmark/README.md @@ -112,7 +112,7 @@ cargo run --release --bin clickbench_client -- --manifest benchmark/stackoverflo cargo run --release --bin in_process -- \ --manifest benchmark/stackoverflow/manifest.json \ --bench-mode liquid \ - --max-cache-mb 256 + --max-memory-mb 256 ``` ## In process mode diff --git a/benchmark/bench_server.rs b/benchmark/bench_server.rs index 3f772f37..fbccae3c 100644 --- a/benchmark/bench_server.rs +++ b/benchmark/bench_server.rs @@ -27,9 +27,9 @@ struct CliArgs { #[arg(long = "abort-on-panic")] abort_on_panic: bool, - /// Maximum cache size in MB - #[arg(long = "max-cache-mb")] - max_cache_mb: Option, + /// Maximum memory size in MB + #[arg(long = "max-memory-mb")] + max_memory_mb: Option, /// Path to disk cache directory #[arg(long = "disk-cache-dir")] @@ -56,7 +56,7 @@ async fn main() -> Result<(), Box> { args.jaeger_endpoint.as_deref(), ); - let max_cache_bytes = args.max_cache_mb.map(|size| size * 1024 * 1024); + let max_memory_bytes = args.max_memory_mb.map(|size| size * 1024 * 1024); if args.abort_on_panic { // Be loud and crash loudly if 
any thread panics. @@ -73,7 +73,7 @@ async fn main() -> Result<(), Box> { let ctx = LiquidCacheService::context()?; let liquid_cache_datafusion_server = LiquidCacheService::new( ctx, - max_cache_bytes, + max_memory_bytes, args.disk_cache_dir.clone(), Box::new(LiquidPolicy::new()), squeeze_policy, diff --git a/benchmark/in_process.rs b/benchmark/in_process.rs index b85f80a6..4f123a76 100644 --- a/benchmark/in_process.rs +++ b/benchmark/in_process.rs @@ -46,9 +46,9 @@ struct InProcessBenchmark { #[arg(long)] pub partitions: Option, - /// Maximum cache size in bytes - #[arg(long = "max-cache-mb")] - pub max_cache_mb: Option, + /// Maximum memory size in MB + #[arg(long = "max-memory-mb")] + pub max_memory_mb: Option, /// Directory to write flamegraph SVG files to #[arg(long = "flamegraph-dir")] @@ -78,7 +78,7 @@ impl InProcessBenchmark { .with_reset_cache(self.reset_cache) .with_perf_events(self.perf_events) .with_partitions(self.partitions) - .with_max_cache_mb(self.max_cache_mb) + .with_max_memory_mb(self.max_memory_mb) .with_flamegraph_dir(self.flamegraph_dir.clone()) .with_cache_dir(self.cache_dir.clone()) .with_query_filter(self.query_index) diff --git a/benchmark/src/inprocess_runner.rs b/benchmark/src/inprocess_runner.rs index 31191a52..4ef6f57d 100644 --- a/benchmark/src/inprocess_runner.rs +++ b/benchmark/src/inprocess_runner.rs @@ -194,7 +194,7 @@ pub struct InProcessBenchmarkRunner { pub iteration: u32, pub reset_cache: bool, pub partitions: Option, - pub max_cache_mb: Option, + pub max_memory_mb: Option, pub flamegraph_dir: Option, pub query_filter: Option, pub cache_dir: Option, @@ -215,7 +215,7 @@ impl InProcessBenchmarkRunner { iteration: 3, reset_cache: false, partitions: None, - max_cache_mb: None, + max_memory_mb: None, flamegraph_dir: None, query_filter: None, cache_dir: None, @@ -249,8 +249,8 @@ impl InProcessBenchmarkRunner { self } - pub fn with_max_cache_mb(mut self, max_cache_mb: Option) -> Self { - self.max_cache_mb = max_cache_mb; + pub fn with_max_memory_mb(mut self, max_memory_mb: Option) -> Self { + self.max_memory_mb = max_memory_mb; self } @@ -294,7 +294,7 @@ impl InProcessBenchmarkRunner { } let cache_size = self - .max_cache_mb + .max_memory_mb .map(|size| size * 1024 * 1024) .unwrap_or(usize::MAX); @@ -323,7 +323,7 @@ impl InProcessBenchmarkRunner { } InProcessBenchmarkMode::Arrow => { let v = LiquidCacheLocalBuilder::new() - .with_max_cache_bytes(cache_size) + .with_max_memory_bytes(cache_size) .with_cache_dir(cache_dir) .with_cache_policy(Box::new(LiquidPolicy::new())) .with_hydration_policy(Box::new(NoHydration::new())) @@ -334,19 +334,18 @@ impl InProcessBenchmarkRunner { } InProcessBenchmarkMode::Liquid => { let v = LiquidCacheLocalBuilder::new() - .with_max_cache_bytes(cache_size) + .with_max_memory_bytes(cache_size) .with_cache_dir(cache_dir) .with_cache_policy(Box::new(LiquidPolicy::new())) .with_hydration_policy(Box::new(NoHydration::new())) .with_squeeze_policy(Box::new(TranscodeSqueezeEvict)) - .with_eager_shredding(true) .build(session_config) .await?; (v.0, Some(v.1)) } InProcessBenchmarkMode::LiquidNoSqueeze => { let v = LiquidCacheLocalBuilder::new() - .with_max_cache_bytes(cache_size) + .with_max_memory_bytes(cache_size) .with_cache_dir(cache_dir) .with_cache_policy(Box::new(LiquidPolicy::new())) .with_hydration_policy(Box::new(NoHydration::new())) diff --git a/dev/README.md b/dev/README.md index 956007c1..72aa3c03 100644 --- a/dev/README.md +++ b/dev/README.md @@ -59,7 +59,7 @@ sudo bpftrace -e ' delete(@t[args->user_data]); } ' \ --c 
'target/release/in_process --manifest benchmark/clickbench/manifest.json --bench-mode liquid-no-squeeze --max-cache-mb 128 --query-index 20 --io-mode uring' +-c 'target/release/in_process --manifest benchmark/clickbench/manifest.json --bench-mode liquid-no-squeeze --max-memory-mb 128 --query-index 20 --io-mode uring' ``` This will trace the execution of `iteration = 2` (`arg1 == 2`) and print the `io_uring` latency in us (from submission to completion) histogram: ``` @@ -90,7 +90,7 @@ tracepoint:syscalls:sys_exit_read /@go==1 && @s[tid]/ { @r = hist((nsecs-@s[ tracepoint:syscalls:sys_exit_pread64 /@go==1 && @s[tid]/ { @r = hist((nsecs-@s[tid])/1000); delete(@s[tid]); } tracepoint:syscalls:sys_exit_write /@go==1 && @s[tid]/ { @w = hist((nsecs-@s[tid])/1000); delete(@s[tid]); } tracepoint:syscalls:sys_exit_pwrite64 /@go==1 && @s[tid]/ { @w = hist((nsecs-@s[tid])/1000); delete(@s[tid]); } -' -c 'target/release/in_process --manifest benchmark/clickbench/manifest.json --bench-mode liquid-no-squeeze --max-cache-mb 128 --query-index 20 --io-mode std-blocking' +' -c 'target/release/in_process --manifest benchmark/clickbench/manifest.json --bench-mode liquid-no-squeeze --max-memory-mb 128 --query-index 20 --io-mode std-blocking' ``` It will generate: diff --git a/dev/dev-tools/Cargo.toml b/dev/dev-tools/Cargo.toml index 75502557..387a086f 100644 --- a/dev/dev-tools/Cargo.toml +++ b/dev/dev-tools/Cargo.toml @@ -6,7 +6,7 @@ edition = "2024" [dependencies] -dioxus = { version = "0.7.4", features = ["router", "fullstack"] } +dioxus = { version = "=0.7.5", features = ["router", "fullstack"] } [features] default = ["web"] diff --git a/examples/Cargo.toml b/examples/Cargo.toml index 7903fd7a..3c9f4aaa 100644 --- a/examples/Cargo.toml +++ b/examples/Cargo.toml @@ -41,5 +41,5 @@ arrow-flight = { workspace = true } tonic = { workspace = true } url = { workspace = true } tempfile = "3.27.0" -clap = { version = "4.6.0", features = ["derive"] } +clap = { version = "4.6.1", features = ["derive"] } object_store = { workspace = true, features = ["http"] } diff --git a/examples/example_inprocess_cache_eviction.rs b/examples/example_inprocess_cache_eviction.rs index 8f3c493a..9ce476dc 100644 --- a/examples/example_inprocess_cache_eviction.rs +++ b/examples/example_inprocess_cache_eviction.rs @@ -16,7 +16,7 @@ async fn main() -> Result<(), Box> { println!("{:?}", temp_dir); let (_ctx, storage) = LiquidCacheLocalBuilder::new() - .with_max_cache_bytes(1024 * 1024) // 1MB + .with_max_memory_bytes(1024 * 1024) // 1MB .with_cache_dir(temp_dir.path().to_path_buf()) .with_squeeze_policy(Box::new(TranscodeSqueezeEvict)) .with_cache_policy(Box::new(LiquidPolicy::new())) @@ -30,7 +30,8 @@ async fn main() -> Result<(), Box> { storage .storage() .insert(entry_id, arrow_array.clone()) - .await; + .await + .unwrap(); let _ = storage.storage().get(&entry_id).await.unwrap(); } println!("{:?}", storage.storage().stats()); diff --git a/examples/example_inprocess_insertion.rs b/examples/example_inprocess_insertion.rs index 7605d50f..9b4dcc95 100644 --- a/examples/example_inprocess_insertion.rs +++ b/examples/example_inprocess_insertion.rs @@ -14,7 +14,7 @@ async fn main() -> Result<(), Box> { let temp_dir = TempDir::new().unwrap(); let (_ctx, storage) = LiquidCacheLocalBuilder::new() - .with_max_cache_bytes(1024 * 1024) // 1MB + .with_max_memory_bytes(1024 * 1024) // 1MB .with_cache_dir(temp_dir.path().to_path_buf()) .with_squeeze_policy(Box::new(TranscodeSqueezeEvict)) .with_cache_policy(Box::new(LiquidPolicy::new())) @@ -28,7 +28,8 
@@ async fn main() -> Result<(), Box> { storage .storage() .insert(entry_id, arrow_array.clone()) - .await; + .await + .unwrap(); assert!(storage.storage().is_cached(&entry_id)); diff --git a/examples/example_inprocess_read.rs b/examples/example_inprocess_read.rs index f2b874ed..7b5efda3 100644 --- a/examples/example_inprocess_read.rs +++ b/examples/example_inprocess_read.rs @@ -9,10 +9,10 @@ async fn main() -> Result<(), Box> { let entry_id = EntryID::from(7); let arrow_array = Arc::new(UInt64Array::from_iter_values(0..16)); - storage.insert(entry_id, arrow_array.clone()).await; + storage.insert(entry_id, arrow_array.clone()).await.unwrap(); // Move data to disk so the read will demonstrate async I/O - storage.flush_all_to_disk().await; + storage.flush_all_to_disk().await.unwrap(); // Read asynchronously let retrieved = storage.get(&entry_id).await.unwrap(); diff --git a/examples/example_local.rs b/examples/example_local.rs index d060703c..38577fab 100644 --- a/examples/example_local.rs +++ b/examples/example_local.rs @@ -9,7 +9,7 @@ async fn main() -> Result<(), Box> { let temp_dir = TempDir::new().unwrap(); let (ctx, _) = LiquidCacheLocalBuilder::new() - .with_max_cache_bytes(1024 * 1024 * 1024) // 1GB + .with_max_memory_bytes(1024 * 1024 * 1024) // 1GB .with_cache_dir(temp_dir.path().to_path_buf()) .with_squeeze_policy(Box::new(TranscodeSqueezeEvict)) .with_cache_policy(Box::new(LiquidPolicy::new())) diff --git a/examples/example_server.rs b/examples/example_server.rs index f6b15303..67872ac7 100644 --- a/examples/example_server.rs +++ b/examples/example_server.rs @@ -1,18 +1,17 @@ use arrow_flight::flight_service_server::FlightServiceServer; use datafusion::prelude::SessionContext; -use liquid_cache_datafusion_local::storage::cache::AlwaysHydrate; use liquid_cache_datafusion_local::storage::cache::squeeze_policies::TranscodeSqueezeEvict; +use liquid_cache_datafusion_local::storage::cache::{AlwaysHydrate, LiquidPolicy}; use liquid_cache_datafusion_server::LiquidCacheService; -use liquid_cache_datafusion_server::storage::cache_policies::LruPolicy; use tonic::transport::Server; #[tokio::main] async fn main() -> Result<(), Box> { let liquid_cache = LiquidCacheService::new( SessionContext::new(), - Some(1024 * 1024 * 1024), // max memory cache size 1GB + Some(1024 * 1024 * 1024), // max memory size 1GB Some(tempfile::tempdir()?.keep()), // disk cache dir - Box::new(LruPolicy::new()), + Box::new(LiquidPolicy::new()), Box::new(TranscodeSqueezeEvict), Box::new(AlwaysHydrate::new()), ) diff --git a/flake.lock b/flake.lock index 6aedd304..91d09a14 100644 --- a/flake.lock +++ b/flake.lock @@ -1,20 +1,5 @@ { "nodes": { - "crane": { - "locked": { - "lastModified": 1774313767, - "narHash": "sha256-hy0XTQND6avzGEUFrJtYBBpFa/POiiaGBr2vpU6Y9tY=", - "owner": "ipetkov", - "repo": "crane", - "rev": "3d9df76e29656c679c744968b17fbaf28f0e923d", - "type": "github" - }, - "original": { - "owner": "ipetkov", - "repo": "crane", - "type": "github" - } - }, "flake-utils": { "inputs": { "systems": "systems" @@ -35,11 +20,11 @@ }, "nixpkgs": { "locked": { - "lastModified": 1775036866, - "narHash": "sha256-ZojAnPuCdy657PbTq5V0Y+AHKhZAIwSIT2cb8UgAz/U=", + "lastModified": 1777268161, + "narHash": "sha256-bxrdOn8SCOv8tN4JbTF/TXq7kjo9ag4M+C8yzzIRYbE=", "owner": "NixOS", "repo": "nixpkgs", - "rev": "6201e203d09599479a3b3450ed24fa81537ebc4e", + "rev": "1c3fe55ad329cbcb28471bb30f05c9827f724c76", "type": "github" }, "original": { @@ -67,7 +52,6 @@ }, "root": { "inputs": { - "crane": "crane", "flake-utils": "flake-utils", 
"nixpkgs": "nixpkgs", "rust-overlay": "rust-overlay" @@ -78,11 +62,11 @@ "nixpkgs": "nixpkgs_2" }, "locked": { - "lastModified": 1775099554, - "narHash": "sha256-3xBsGnGDLOFtnPZ1D3j2LU19wpAlYefRKTlkv648rU0=", + "lastModified": 1777432579, + "narHash": "sha256-Ce11TStDsqCge2vAAfLKe2+4lDI5cSX5ZYZOuKJBKKQ=", "owner": "oxalica", "repo": "rust-overlay", - "rev": "8d6387ed6d8e6e6672fd3ed4b61b59d44b124d99", + "rev": "3ecb5e6ab380ced3272ef7fcfe398bffbcc0f152", "type": "github" }, "original": { diff --git a/flake.nix b/flake.nix index 327fbe8f..ef8360ae 100644 --- a/flake.nix +++ b/flake.nix @@ -5,14 +5,12 @@ nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable"; rust-overlay.url = "github:oxalica/rust-overlay"; flake-utils.url = "github:numtide/flake-utils"; - crane.url = "github:ipetkov/crane"; }; outputs = { nixpkgs , rust-overlay , flake-utils - , crane , ... }: flake-utils.lib.eachDefaultSystem ( @@ -22,26 +20,14 @@ pkgs = import nixpkgs { inherit system overlays; }; - craneLib = crane.mkLib pkgs; - kaniVerifier = craneLib.buildPackage { - pname = "kani-verifier"; - version = "0.67.0"; - src = craneLib.downloadCargoPackage { - name = "kani-verifier"; - version = "0.67.0"; - source = "registry+https://github.com/rust-lang/crates.io-index"; - checksum = "sha256-1iJafsEwN+mE9r692jPTQ5DmQ6HNKkUiy11ejm7YXis="; - }; - doCheck = false; - }; # Fetch daisyUI bundle files daisyui-bundle = pkgs.fetchurl { - url = "https://github.com/saadeghi/daisyui/releases/latest/download/daisyui.mjs"; - sha256 = "sha256-dH6epo+aSV+eeh3uQbxd7MkWlG+6hCaGaknQ4Bnljj4="; + url = "https://github.com/saadeghi/daisyui/releases/download/v5.5.19/daisyui.mjs"; + sha256 = "sha256-X+Q/9eg8XPUZzMMtdqoagu1r/FDuPm9dxgB+6mI5rx8="; }; daisyui-theme-bundle = pkgs.fetchurl { - url = "https://github.com/saadeghi/daisyui/releases/latest/download/daisyui-theme.mjs"; - sha256 = "sha256-iiUODarjHRxAD+tyOPh95xhHJELC40oczt+dsDo86yE="; + url = "https://github.com/saadeghi/daisyui/releases/download/v5.5.19/daisyui-theme.mjs"; + sha256 = "sha256-tAcb7y5ZvYNQllnB5ybMGXBKH9FP8uVtR5vBampT8m0="; }; in { @@ -52,7 +38,6 @@ pkg-config eza fd - kaniVerifier llvmPackages.bintools lldb cargo-fuzz @@ -64,7 +49,7 @@ nodejs tailwindcss_4 dioxus-cli - wasm-bindgen-cli_0_2_108 + wasm-bindgen-cli_0_2_118 binaryen (rust-bin.selectLatestNightlyWith (toolchain: toolchain.default.override { extensions = [ "rust-src" "llvm-tools-preview" ]; diff --git a/src/core/Cargo.toml b/src/core/Cargo.toml index c7a6cca3..214479a4 100644 --- a/src/core/Cargo.toml +++ b/src/core/Cargo.toml @@ -7,10 +7,6 @@ readme = "README.md" description = { workspace = true } repository = { workspace = true } -[package.metadata.cargo-shear] -ignored = ["kani-verifier"] - - [dependencies] async-stream = "0.3.6" async-trait = { workspace = true } @@ -27,7 +23,7 @@ arrow = { workspace = true } arrow-schema = { workspace = true } fastlanes = "0.5.0" num-traits = "0.2.19" -fsst-rs = "0.5.9" +fsst-rs = "0.5.10" ahash = { workspace = true } tempfile = { workspace = true } congee = { workspace = true } @@ -40,26 +36,18 @@ serde = { workspace = true } [dev-dependencies] tempfile = { workspace = true } -shuttle = "0.8.1" +shuttle = "0.9.1" tokio-test = "0.4" tracing-subscriber = "0.3.23" -rand = "0.10.0" +rand = "0.10.1" serde.workspace = true serde_json = { workspace = true } -mimalloc = "0.1.48" -clap = { version = "4.6.0", features = ["derive"] } +mimalloc = "0.1.50" +clap = { version = "4.6.1", features = ["derive"] } divan = "0.1.21" -kani-verifier = "0.67.0" insta = { workspace = true } datafusion = { 
workspace = true } -[lints.rust] -unexpected_cfgs = { level = "warn", check-cfg = [ - 'cfg(rust_analyzer)', - 'cfg(kani)', -] } - - [features] shuttle = ["t4/shuttle"] diff --git a/src/core/src/cache/budget.rs b/src/core/src/cache/budget.rs index 921b388f..2b7bc5d0 100644 --- a/src/core/src/cache/budget.rs +++ b/src/core/src/cache/budget.rs @@ -1,18 +1,30 @@ -use crate::sync::atomic::{AtomicUsize, Ordering}; +use super::observer::Observer; +use crate::sync::{ + Arc, + atomic::{AtomicUsize, Ordering}, +}; #[derive(Debug)] pub struct BudgetAccounting { max_memory_bytes: usize, + max_disk_bytes: usize, used_memory_bytes: AtomicUsize, used_disk_bytes: AtomicUsize, + observer: Arc<Observer>, } impl BudgetAccounting { - pub(super) fn new(max_memory_bytes: usize) -> Self { + pub(super) fn new( + max_memory_bytes: usize, + max_disk_bytes: usize, + observer: Arc<Observer>, + ) -> Self { Self { max_memory_bytes, + max_disk_bytes, used_memory_bytes: AtomicUsize::new(0), used_disk_bytes: AtomicUsize::new(0), + observer, } } @@ -21,8 +33,8 @@ impl BudgetAccounting { self.used_disk_bytes.store(0, Ordering::Relaxed); } - /// Try to reserve space in the cache. - /// Returns ok if the space was reserved, err if the cache is full. + /// Try to reserve memory in the cache. + /// Returns ok if the memory was reserved, err if the memory budget is full. pub(super) fn try_reserve_memory(&self, request_bytes: usize) -> Result<(), ()> { let used = self.used_memory_bytes.load(Ordering::Relaxed); if used + request_bytes > self.max_memory_bytes { @@ -40,8 +52,8 @@ impl BudgetAccounting { } } - /// Adjust the cache size after transcoding. - /// Returns true if the size was adjusted, false if the cache is full, when new_size is larger than old_size. + /// Adjust memory usage after transcoding. + /// Returns ok if the usage was adjusted, err if new_size is larger than old_size and the memory budget is full.
pub(super) fn try_update_memory_usage( &self, old_size: usize, @@ -66,8 +78,26 @@ impl BudgetAccounting { self.used_disk_bytes.load(Ordering::Relaxed) } - pub fn add_used_disk_bytes(&self, bytes: usize) { - self.used_disk_bytes.fetch_add(bytes, Ordering::Relaxed); + pub(super) fn try_reserve_disk(&self, request_bytes: usize) -> Result<(), ()> { + let used = self.used_disk_bytes.load(Ordering::Relaxed); + if used + request_bytes > self.max_disk_bytes { + self.observer.on_disk_reservation_failure(); + return Err(()); + } + + match self.used_disk_bytes.compare_exchange( + used, + used + request_bytes, + Ordering::Relaxed, + Ordering::Relaxed, + ) { + Ok(_) => Ok(()), + Err(_) => self.try_reserve_disk(request_bytes), + } + } + + pub(super) fn release_disk(&self, bytes: usize) { + self.used_disk_bytes.fetch_sub(bytes, Ordering::Relaxed); } } @@ -76,9 +106,13 @@ mod tests { use super::*; use crate::sync::{Arc, Barrier, thread}; + fn test_budget(max_memory_bytes: usize, max_disk_bytes: usize) -> BudgetAccounting { + BudgetAccounting::new(max_memory_bytes, max_disk_bytes, Arc::new(Observer::new())) + } + #[test] fn test_memory_reservation_and_accounting() { - let config = BudgetAccounting::new(1000); + let config = test_budget(1000, usize::MAX); assert_eq!(config.memory_usage_bytes(), 0); @@ -111,7 +145,7 @@ mod tests { let max_memory = 10000; let operations_per_thread = 100; - let budget = Arc::new(BudgetAccounting::new(max_memory)); + let budget = Arc::new(test_budget(max_memory, usize::MAX)); let barrier = Arc::new(Barrier::new(num_threads)); let mut thread_handles = vec![]; @@ -165,4 +199,22 @@ mod tests { assert_eq!(budget.memory_usage_bytes(), expected_memory_usage); assert!(budget.memory_usage_bytes() <= max_memory); } + + #[test] + fn disk_reservation_and_release() { + let budget = test_budget(usize::MAX, 1000); + + assert_eq!(budget.disk_usage_bytes(), 0); + assert!(budget.try_reserve_disk(400).is_ok()); + assert_eq!(budget.disk_usage_bytes(), 400); + assert!(budget.try_reserve_disk(600).is_ok()); + assert_eq!(budget.disk_usage_bytes(), 1000); + assert!(budget.try_reserve_disk(1).is_err()); + assert_eq!(budget.disk_usage_bytes(), 1000); + + budget.release_disk(250); + assert_eq!(budget.disk_usage_bytes(), 750); + assert!(budget.try_reserve_disk(250).is_ok()); + assert_eq!(budget.disk_usage_bytes(), 1000); + } } diff --git a/src/core/src/cache/builders.rs b/src/core/src/cache/builders.rs index f3d35862..4d4f9cf7 100644 --- a/src/core/src/cache/builders.rs +++ b/src/core/src/cache/builders.rs @@ -8,9 +8,9 @@ use arrow::buffer::BooleanBuffer; use super::cached_batch::CacheEntry; use super::core::LiquidCache; -use super::io_context::{DefaultIoContext, IoContext}; +use super::io_context::{DefaultCacheMetadata, EntryMetadata}; use super::policies::{CachePolicy, HydrationPolicy, SqueezePolicy, TranscodeSqueezeEvict}; -use super::{CacheExpression, EntryID, LiquidExpr, LiquidPolicy}; +use super::{CacheExpression, CacheFull, EntryID, LiquidExpr, LiquidPolicy}; use crate::sync::Arc; /// Builder for [LiquidCache]. 
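The `BudgetAccounting::try_reserve_disk` added in budget.rs above reserves disk budget with a load/compare-exchange retry, recursing on contention. Below is a minimal self-contained sketch of the same pattern, written with an explicit loop instead of recursion; the `DiskBudget` type and the bare `Err(())` signaling are illustrative stand-ins, not the crate's API:

```rust
use std::sync::atomic::{AtomicUsize, Ordering};

/// Illustrative stand-in for the budget's disk-side accounting.
struct DiskBudget {
    max_bytes: usize,
    used_bytes: AtomicUsize,
}

impl DiskBudget {
    /// Reserve `request` bytes, failing if the budget would be exceeded.
    fn try_reserve(&self, request: usize) -> Result<(), ()> {
        loop {
            let used = self.used_bytes.load(Ordering::Relaxed);
            if used + request > self.max_bytes {
                return Err(()); // over budget: caller must evict disk entries or give up
            }
            // Publish the reservation only if no other thread raced us;
            // on conflict, re-read the counter and try again.
            if self
                .used_bytes
                .compare_exchange(used, used + request, Ordering::Relaxed, Ordering::Relaxed)
                .is_ok()
            {
                return Ok(());
            }
        }
    }

    fn release(&self, bytes: usize) {
        self.used_bytes.fetch_sub(bytes, Ordering::Relaxed);
    }
}
```

The recursive retry in the diff and this loop are equivalent; the loop merely avoids growing the stack if contention forces many retries.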
@@ -23,7 +23,7 @@ use crate::sync::Arc; /// tokio_test::block_on(async { /// let _storage = LiquidCacheBuilder::new() /// .with_batch_size(8192) -/// .with_max_cache_bytes(1024 * 1024 * 1024) +/// .with_max_memory_bytes(1024 * 1024 * 1024) /// .with_cache_policy(Box::new(LiquidPolicy::new())) /// .build() /// .await; @@ -31,11 +31,14 @@ use crate::sync::Arc; /// ``` pub struct LiquidCacheBuilder { batch_size: usize, - max_cache_bytes: usize, + max_memory_bytes: usize, + max_disk_bytes: usize, cache_policy: Box<dyn CachePolicy>, hydration_policy: Box<dyn HydrationPolicy>, squeeze_policy: Box<dyn SqueezePolicy>, - io_context: Option<Arc<dyn IoContext>>, + metadata: Option<Arc<dyn EntryMetadata>>, + store: Option<t4::Store>, + squeeze_victims_concurrently: bool, } impl Default for LiquidCacheBuilder { @@ -49,11 +52,14 @@ impl LiquidCacheBuilder { pub fn new() -> Self { Self { batch_size: 8192, - max_cache_bytes: 1024 * 1024 * 1024, + max_memory_bytes: 1024 * 1024 * 1024, + max_disk_bytes: usize::MAX, cache_policy: Box::new(LiquidPolicy::new()), hydration_policy: Box::new(super::AlwaysHydrate::new()), squeeze_policy: Box::new(TranscodeSqueezeEvict), - io_context: None, + metadata: None, + store: None, + squeeze_victims_concurrently: !cfg!(test), } } @@ -64,10 +70,17 @@ impl LiquidCacheBuilder { self } - /// Set the max cache bytes for the cache. + /// Set the max memory bytes for the cache. /// Default is 1GB. - pub fn with_max_cache_bytes(mut self, max_cache_bytes: usize) -> Self { - self.max_cache_bytes = max_cache_bytes; + pub fn with_max_memory_bytes(mut self, max_memory_bytes: usize) -> Self { + self.max_memory_bytes = max_memory_bytes; + self + } + + /// Set the max disk bytes for the cache. + /// Default is unlimited. + pub fn with_max_disk_bytes(mut self, max_disk_bytes: usize) -> Self { + self.max_disk_bytes = max_disk_bytes; self } @@ -92,37 +105,54 @@ impl LiquidCacheBuilder { self } - /// Set the [IoContext] for the cache. - /// Default is [DefaultIoContext]. - pub fn with_io_context(mut self, io_context: Arc<dyn IoContext>) -> Self { - self.io_context = Some(io_context); + /// Set the [EntryMetadata] for the cache. + /// Default is [DefaultCacheMetadata]. + pub fn with_metadata(mut self, metadata: Arc<dyn EntryMetadata>) -> Self { + self.metadata = Some(metadata); + self + } + + /// Set the [`t4::Store`] used for on-disk IO. + /// If not provided, the builder mounts a fresh store at a temporary directory. + pub fn with_store(mut self, store: t4::Store) -> Self { + self.store = Some(store); + self + } + + /// Set whether cache victims are squeezed concurrently. + pub fn with_squeeze_victims_concurrently(mut self, enabled: bool) -> Self { + self.squeeze_victims_concurrently = enabled; self } /// Build the cache storage. /// /// The cache storage is wrapped in an [Arc] to allow for concurrent access. - /// When no custom [IoContext] is provided, a [`t4::Store`] is mounted at a - /// temporary directory. + /// When no [`t4::Store`] is provided, one is mounted at a temporary directory.
pub async fn build(self) -> Arc<LiquidCache> { - let io_worker = match self.io_context { - Some(io_context) => io_context, + let store = match self.store { + Some(store) => store, None => { let cache_dir = tempfile::tempdir().unwrap().keep(); let store_path = cache_dir.join("liquid_cache.t4"); - let store = t4::mount(&store_path) + t4::mount(&store_path) .await - .expect("failed to mount t4 store"); - Arc::new(DefaultIoContext::new(store)) + .expect("failed to mount t4 store") } }; + let metadata = self + .metadata + .unwrap_or_else(|| Arc::new(DefaultCacheMetadata::new())); Arc::new(LiquidCache::new( self.batch_size, - self.max_cache_bytes, + self.max_memory_bytes, + self.max_disk_bytes, self.squeeze_policy, self.cache_policy, self.hydration_policy, - io_worker, + metadata, + store, + self.squeeze_victims_concurrently, )) } } @@ -160,7 +190,7 @@ impl<'a> Insert<'a> { self } - async fn run(self) { + async fn run(self) -> Result<(), CacheFull> { let batch = if self.skip_gc { self.batch.clone() } else { @@ -170,13 +200,13 @@ impl<'a> Insert<'a> { self.storage.add_squeeze_hint(&self.entry_id, squeeze_hint); } let batch = CacheEntry::memory_arrow(batch); - self.storage.insert_inner(self.entry_id, batch).await; + self.storage.insert_inner(self.entry_id, batch).await } } impl<'a> IntoFuture for Insert<'a> { - type Output = (); - type IntoFuture = Pin<Box<dyn Future<Output = ()> + Send + 'a>>; + type Output = Result<(), CacheFull>; + type IntoFuture = Pin<Box<dyn Future<Output = Result<(), CacheFull>> + Send + 'a>>; fn into_future(self) -> Self::IntoFuture { Box::pin(async move { self.run().await }) @@ -409,7 +439,7 @@ mod tests { let cache = LiquidCacheBuilder::new().build().await; let entry_id = EntryID::from(123usize); - cache.insert(entry_id, root.clone()).await; + cache.insert(entry_id, root.clone()).await.unwrap(); let stored = cache.get(&entry_id).await.expect("array present"); let post_size = stored.get_array_memory_size(); diff --git a/src/core/src/cache/cached_batch.rs b/src/core/src/cache/cached_batch.rs index 62b5b811..fda9950a 100644 --- a/src/core/src/cache/cached_batch.rs +++ b/src/core/src/cache/cached_batch.rs @@ -17,9 +17,19 @@ pub enum CacheEntry { /// Cached batch in memory as squeezed liquid array. MemorySqueezedLiquid(LiquidSqueezedArrayRef), /// Cached batch on disk as liquid array. - DiskLiquid(DataType), + DiskLiquid { + /// Original Arrow data type. + data_type: DataType, + /// Byte length of the persisted backing data. + disk_bytes: usize, + }, /// Cached batch on disk as Arrow array. - DiskArrow(DataType), + DiskArrow { + /// Original Arrow data type. + data_type: DataType, + /// Byte length of the persisted backing data. + disk_bytes: usize, + }, } impl CacheEntry { @@ -39,13 +49,19 @@ impl CacheEntry { } /// Construct a cached batch stored on disk as Liquid bytes. - pub fn disk_liquid(data_type: DataType) -> Self { - Self::DiskLiquid(data_type) + pub fn disk_liquid(data_type: DataType, disk_bytes: usize) -> Self { + Self::DiskLiquid { + data_type, + disk_bytes, + } } /// Construct a cached batch stored on disk as Arrow bytes. - pub fn disk_arrow(data_type: DataType) -> Self { - Self::DiskArrow(data_type) + pub fn disk_arrow(data_type: DataType, disk_bytes: usize) -> Self { + Self::DiskArrow { + data_type, + disk_bytes, + } } /// Memory usage reported by the underlying representation.
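With `Insert::run` now returning `Result<(), CacheFull>` and its `IntoFuture` impl updated to match, every awaited insert becomes fallible. Below is a minimal sketch of a call site under the new contract; the commented `use` path is an assumption (the public re-export path is not shown in this diff), and the tests elsewhere in this change simply `.unwrap()` in the same position:

```rust
use std::sync::Arc;
use arrow::array::{ArrayRef, Int32Array};
// use liquid_cache_storage::cache::{CacheFull, EntryID, LiquidCache}; // path assumed

// `cache` is the Arc<LiquidCache> produced by LiquidCacheBuilder::build().
async fn insert_or_report(cache: Arc<LiquidCache>) {
    let array: ArrayRef = Arc::new(Int32Array::from_iter_values(0..16));
    match cache.insert(EntryID::from(1usize), array).await {
        // Cached in memory, or spilled to disk within the disk budget.
        Ok(()) => {}
        // Both the memory and disk budgets are saturated and nothing could be evicted.
        Err(CacheFull) => eprintln!("cache full: entry was not cached"),
    }
}
```

`flush_all_to_disk` gains the same `Result<(), CacheFull>` shape, though as the tests later in this diff show, it prefers dropping the in-memory entry over surfacing the error when the disk budget cannot be freed.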
@@ -54,7 +70,7 @@ impl CacheEntry { Self::MemoryArrow(array) => array.get_array_memory_size(), Self::MemoryLiquid(array) => array.get_array_memory_size(), Self::MemorySqueezedLiquid(array) => array.get_array_memory_size(), - Self::DiskLiquid(_) | Self::DiskArrow(_) => 0, + Self::DiskLiquid { .. } | Self::DiskArrow { .. } => 0, } } @@ -64,7 +80,7 @@ impl CacheEntry { Self::MemoryArrow(array) => Arc::strong_count(array), Self::MemoryLiquid(array) => Arc::strong_count(array), Self::MemorySqueezedLiquid(array) => Arc::strong_count(array), - Self::DiskLiquid(_) | Self::DiskArrow(_) => 0, + Self::DiskLiquid { .. } | Self::DiskArrow { .. } => 0, } } } @@ -75,8 +91,8 @@ impl Display for CacheEntry { Self::MemoryArrow(_) => write!(f, "MemoryArrow"), Self::MemoryLiquid(_) => write!(f, "MemoryLiquid"), Self::MemorySqueezedLiquid(_) => write!(f, "MemorySqueezedLiquid"), - Self::DiskLiquid(_) => write!(f, "DiskLiquid"), - Self::DiskArrow(_) => write!(f, "DiskArrow"), + Self::DiskLiquid { .. } => write!(f, "DiskLiquid"), + Self::DiskArrow { .. } => write!(f, "DiskArrow"), } } } @@ -102,8 +118,8 @@ impl From<&CacheEntry> for CachedBatchType { CacheEntry::MemoryArrow(_) => Self::MemoryArrow, CacheEntry::MemoryLiquid(_) => Self::MemoryLiquid, CacheEntry::MemorySqueezedLiquid(_) => Self::MemorySqueezedLiquid, - CacheEntry::DiskLiquid(_) => Self::DiskLiquid, - CacheEntry::DiskArrow(_) => Self::DiskArrow, + CacheEntry::DiskLiquid { .. } => Self::DiskLiquid, + CacheEntry::DiskArrow { .. } => Self::DiskArrow, } } } diff --git a/src/core/src/cache/core.rs b/src/core/src/cache/core.rs index 7a5d76df..84e0a795 100644 --- a/src/core/src/cache/core.rs +++ b/src/core/src/cache/core.rs @@ -2,7 +2,7 @@ use arrow::array::cast::AsArray; use arrow::array::{ArrayRef, BooleanArray}; use arrow::buffer::BooleanBuffer; use arrow::record_batch::RecordBatch; -use arrow_schema::{DataType, Field, Schema}; +use arrow_schema::{Field, Schema}; use bytes::Bytes; use futures::StreamExt; @@ -10,16 +10,16 @@ use super::{ budget::BudgetAccounting, builders::{EvaluatePredicate, Get, Insert}, cached_batch::{CacheEntry, CachedBatchType}, - io_context::IoContext, + io_context::{EntryMetadata, entry_id_to_key}, observer::{CacheTracer, InternalEvent, Observer}, policies::{CachePolicy, HydrationPolicy, HydrationRequest, MaterializedEntry}, utils::CacheConfig, }; use crate::cache::DefaultSqueezeIo; -use crate::cache::policies::SqueezePolicy; +use crate::cache::policies::{SqueezeOutcome, SqueezePolicy}; use crate::cache::utils::{LiquidCompressorStates, arrow_to_bytes}; use crate::cache::{CacheExpression, LiquidExpr, index::ArtIndex, utils::EntryID}; -use crate::cache::{CacheStats, EventTrace}; +use crate::cache::{CacheFull, CacheStats, EventTrace}; use crate::liquid_array::{ LiquidSqueezedArrayRef, SqueezeIoHandler, SqueezedBacking, SqueezedDate32Array, VariantStructSqueezedArray, @@ -57,7 +57,9 @@ pub struct LiquidCache { hydration_policy: Box, squeeze_policy: Box, observer: Arc, - io_context: Arc, + metadata: Arc, + store: t4::Store, + squeeze_victims_concurrently: bool, } /// Builder returned by [`LiquidCache::insert`] for configuring cache writes. @@ -90,8 +92,8 @@ impl LiquidCache { memory_squeezed_liquid_entries += 1; memory_squeezed_liquid_bytes += array.get_array_memory_size(); } - CacheEntry::DiskLiquid(_) => disk_liquid_entries += 1, - CacheEntry::DiskArrow(_) => disk_arrow_entries += 1, + CacheEntry::DiskLiquid { .. } => disk_liquid_entries += 1, + CacheEntry::DiskArrow { .. 
} => disk_arrow_entries += 1, }); let memory_usage_bytes = self.budget.memory_usage_bytes(); @@ -110,7 +112,8 @@ impl LiquidCache { memory_squeezed_liquid_bytes, memory_usage_bytes, disk_usage_bytes, - max_cache_bytes: self.config.max_cache_bytes(), + max_memory_bytes: self.config.max_memory_bytes(), + max_disk_bytes: self.config.max_disk_bytes(), runtime, } } @@ -152,20 +155,20 @@ impl LiquidCache { match batch.as_ref() { CacheEntry::MemoryLiquid(array) => Some(array.clone()), - entry @ CacheEntry::DiskLiquid(_) => { + entry @ CacheEntry::DiskLiquid { .. } => { let liquid = self.read_disk_liquid_array(entry_id).await; self.maybe_hydrate(entry_id, entry, MaterializedEntry::Liquid(&liquid), None) .await; Some(liquid) } CacheEntry::MemorySqueezedLiquid(array) => match array.disk_backing() { - SqueezedBacking::Liquid => { + SqueezedBacking::Liquid(_) => { let liquid = self.read_disk_liquid_array(entry_id).await; Some(liquid) } - SqueezedBacking::Arrow => None, + SqueezedBacking::Arrow(_) => None, }, - CacheEntry::DiskArrow(_) | CacheEntry::MemoryArrow(_) => None, + CacheEntry::DiskArrow { .. } | CacheEntry::MemoryArrow(_) => None, } } @@ -209,16 +212,16 @@ impl LiquidCache { /// Get the compressor states of the cache. pub fn compressor_states(&self, entry_id: &EntryID) -> Arc<LiquidCompressorStates> { - self.io_context.get_compressor(entry_id) + self.metadata.get_compressor(entry_id) } /// Add a squeeze hint for an entry. pub fn add_squeeze_hint(&self, entry_id: &EntryID, expression: Arc<CacheExpression>) { - self.io_context.add_squeeze_hint(entry_id, expression); + self.metadata.add_squeeze_hint(entry_id, expression); } /// Flush all entries to disk. - pub async fn flush_all_to_disk(&self) { + pub async fn flush_all_to_disk(&self) -> Result<(), CacheFull> { let mut entries = Vec::new(); self.for_each_entry(|entry_id, batch| { entries.push((*entry_id, batch.clone())); @@ -227,19 +230,37 @@ match &batch { CacheEntry::MemoryArrow(array) => { let bytes = arrow_to_bytes(array).expect("failed to convert arrow to bytes"); - self.write_batch_to_disk(entry_id, &batch, bytes).await; - self.try_insert(entry_id, CacheEntry::disk_arrow(array.data_type().clone())) - .expect("failed to insert disk arrow entry"); + let disk_bytes = bytes.len(); + match self.write_batch_to_disk(entry_id, &batch, bytes).await { + Ok(()) => { + self.try_insert( + entry_id, + CacheEntry::disk_arrow(array.data_type().clone(), disk_bytes), + ) + .expect("failed to insert disk arrow entry"); + } + Err(CacheFull) => self.drop_memory_entry(entry_id, &batch), + } } CacheEntry::MemoryLiquid(liquid_array) => { let liquid_bytes = liquid_array.to_bytes(); - self.write_batch_to_disk(entry_id, &batch, Bytes::from(liquid_bytes)) - .await; - self.try_insert( - entry_id, - CacheEntry::disk_liquid(liquid_array.original_arrow_data_type()), - ) - .expect("failed to insert disk liquid entry"); + let disk_bytes = liquid_bytes.len(); + match self + .write_batch_to_disk(entry_id, &batch, Bytes::from(liquid_bytes)) + .await + { + Ok(()) => { + self.try_insert( + entry_id, + CacheEntry::disk_liquid( + liquid_array.original_arrow_data_type(), + disk_bytes, + ), + ) + .expect("failed to insert disk liquid entry"); + } + Err(CacheFull) => self.drop_memory_entry(entry_id, &batch), + } } CacheEntry::MemorySqueezedLiquid(array) => { // We don't have to do anything, because it's already on disk @@ -247,11 +268,12 @@ self.try_insert(entry_id, disk_entry) .expect("failed to insert disk entry"); } - CacheEntry::DiskArrow(_) | CacheEntry::DiskLiquid(_) => { +
CacheEntry::DiskArrow { .. } | CacheEntry::DiskLiquid { .. } => { // Already on disk, skip } } } + Ok(()) } } @@ -261,70 +283,85 @@ impl LiquidCache { &self, entry_id: EntryID, batch: CacheEntry, - ) -> CacheEntry { + ) -> Result { match &batch { batch @ CacheEntry::MemoryArrow(_) => { let squeeze_io: Arc = Arc::new(DefaultSqueezeIo::new( - self.io_context.clone(), + self.store.clone(), entry_id, self.observer.clone(), )); - let (new_batch, bytes_to_write) = self.squeeze_policy.squeeze( + let outcome = self.squeeze_policy.squeeze( batch, - self.io_context.get_compressor(&entry_id).as_ref(), + self.metadata.get_compressor(&entry_id).as_ref(), None, &squeeze_io, ); + let SqueezeOutcome::Replace { + entry: new_batch, + bytes_to_write, + } = outcome + else { + unreachable!("memory arrow squeeze cannot remove entry"); + }; if let Some(bytes_to_write) = bytes_to_write { self.write_batch_to_disk(entry_id, &new_batch, bytes_to_write) - .await; + .await?; } - new_batch + Ok(new_batch) } CacheEntry::MemoryLiquid(liquid_array) => { let liquid_bytes = Bytes::from(liquid_array.to_bytes()); + let disk_bytes = liquid_bytes.len(); self.write_batch_to_disk(entry_id, &batch, liquid_bytes) - .await; - CacheEntry::disk_liquid(liquid_array.original_arrow_data_type()) + .await?; + Ok(CacheEntry::disk_liquid( + liquid_array.original_arrow_data_type(), + disk_bytes, + )) } CacheEntry::MemorySqueezedLiquid(squeezed_array) => { // The full data is already on disk, so we just need to mark ourself as disk entry - let backing = squeezed_array.disk_backing(); - if backing == SqueezedBacking::Liquid { - CacheEntry::disk_liquid(squeezed_array.original_arrow_data_type()) - } else { - CacheEntry::disk_arrow(squeezed_array.original_arrow_data_type()) - } + let data_type = squeezed_array.original_arrow_data_type(); + let entry = match squeezed_array.disk_backing() { + SqueezedBacking::Liquid(n) => CacheEntry::disk_liquid(data_type, n), + SqueezedBacking::Arrow(n) => CacheEntry::disk_arrow(data_type, n), + }; + Ok(entry) } - CacheEntry::DiskLiquid(_) | CacheEntry::DiskArrow(_) => { + CacheEntry::DiskLiquid { .. } | CacheEntry::DiskArrow { .. } => { unreachable!("Unexpected batch in write_in_memory_batch_to_disk") } } } /// Insert a batch into the cache, it will run cache replacement policy until the batch is inserted. - pub(crate) async fn insert_inner(&self, entry_id: EntryID, mut batch_to_cache: CacheEntry) { + pub(crate) async fn insert_inner( + &self, + entry_id: EntryID, + mut batch_to_cache: CacheEntry, + ) -> Result<(), CacheFull> { loop { let Err(not_inserted) = self.try_insert(entry_id, batch_to_cache) else { - return; + return Ok(()); }; self.trace(InternalEvent::InsertFailed { entry: entry_id, kind: CachedBatchType::from(¬_inserted), }); - let victims = self.cache_policy.find_victim(8); + let victims = self.cache_policy.find_memory_victim(8); if victims.is_empty() { // no advice, because the cache is already empty // this can happen if the entry to be inserted is too large, in that case, // we write it to disk let on_disk_batch = self .write_in_memory_batch_to_disk(entry_id, not_inserted) - .await; + .await?; batch_to_cache = on_disk_batch; continue; } - self.squeeze_victims(victims).await; + self.squeeze_victims(victims).await?; batch_to_cache = not_inserted; crate::utils::yield_now_if_shuttle(); @@ -332,24 +369,35 @@ impl LiquidCache { } /// Create a new instance of CacheStorage. 
+ #[allow(clippy::too_many_arguments)] pub(crate) fn new( batch_size: usize, - max_cache_bytes: usize, + max_memory_bytes: usize, + max_disk_bytes: usize, squeeze_policy: Box, cache_policy: Box, hydration_policy: Box, - io_worker: Arc, + metadata: Arc, + store: t4::Store, + squeeze_victims_concurrently: bool, ) -> Self { - let config = CacheConfig::new(batch_size, max_cache_bytes); + let config = CacheConfig::new(batch_size, max_memory_bytes, max_disk_bytes); + let observer = Arc::new(Observer::new()); Self { index: ArtIndex::new(), - budget: BudgetAccounting::new(config.max_cache_bytes()), + budget: BudgetAccounting::new( + config.max_memory_bytes(), + config.max_disk_bytes(), + observer.clone(), + ), config, cache_policy, hydration_policy, squeeze_policy, - observer: Arc::new(Observer::new()), - io_context: io_worker, + observer, + metadata, + store, + squeeze_victims_concurrently, } } @@ -386,6 +434,46 @@ impl LiquidCache { Ok(()) } + fn drop_memory_entry(&self, entry_id: EntryID, _expected: &CacheEntry) { + let Some(removed) = self.index.remove(&entry_id) else { + return; + }; + assert!( + matches!( + removed.as_ref(), + CacheEntry::MemoryArrow(_) + | CacheEntry::MemoryLiquid(_) + | CacheEntry::MemorySqueezedLiquid(_) + ), + "flush should only drop memory entries" + ); + self.budget + .try_update_memory_usage(removed.memory_usage_bytes(), 0) + .expect("memory release cannot fail"); + self.cache_policy.notify_remove(&entry_id); + } + + async fn remove_disk_entry(&self, entry_id: EntryID) { + let Some(removed) = self.index.remove(&entry_id) else { + return; + }; + let disk_bytes = match removed.as_ref() { + CacheEntry::DiskLiquid { disk_bytes, .. } + | CacheEntry::DiskArrow { disk_bytes, .. } => *disk_bytes, + _ => panic!("remove_disk_entry called for non-disk entry"), + }; + self.store + .remove(&entry_id_to_key(&entry_id)) + .await + .expect("disk remove failed"); + self.budget.release_disk(disk_bytes); + self.cache_policy.notify_remove(&entry_id); + self.trace(InternalEvent::DiskEvict { + entry: entry_id, + bytes: disk_bytes, + }); + } + /// Consume the trace of the cache, for testing only. 
pub fn consume_event_trace(&self) -> EventTrace { self.observer.consume_event_trace() @@ -402,61 +490,80 @@ impl LiquidCache { } #[fastrace::trace] - async fn squeeze_victims(&self, victims: Vec) { - // Run squeeze operations sequentially using async I/O + async fn squeeze_victims(&self, victims: Vec) -> Result<(), CacheFull> { self.trace(InternalEvent::SqueezeBegin { victims: victims.clone(), }); - futures::stream::iter(victims) - .for_each_concurrent(None, |victim| async move { - self.squeeze_victim_inner(victim).await; - }) - .await; + if self.squeeze_victims_concurrently { + let results = futures::stream::iter(victims) + .map(|victim| self.squeeze_victim_inner(victim)) + .buffer_unordered(usize::MAX) + .collect::>() + .await; + results.into_iter().collect::, _>>()?; + } else { + for victim in victims { + self.squeeze_victim_inner(victim).await?; + } + } + Ok(()) } - async fn squeeze_victim_inner(&self, to_squeeze: EntryID) { + async fn squeeze_victim_inner(&self, to_squeeze: EntryID) -> Result<(), CacheFull> { let Some(mut to_squeeze_batch) = self.index.get(&to_squeeze) else { - return; + return Ok(()); }; self.trace(InternalEvent::SqueezeVictim { entry: to_squeeze }); - let compressor = self.io_context.get_compressor(&to_squeeze); - let squeeze_hint_arc = self.io_context.squeeze_hint(&to_squeeze); + let compressor = self.metadata.get_compressor(&to_squeeze); + let squeeze_hint_arc = self.metadata.squeeze_hint(&to_squeeze); let squeeze_hint = squeeze_hint_arc.as_deref(); let squeeze_io: Arc = Arc::new(DefaultSqueezeIo::new( - self.io_context.clone(), + self.store.clone(), to_squeeze, self.observer.clone(), )); loop { - let (new_batch, bytes_to_write) = self.squeeze_policy.squeeze( + let outcome = self.squeeze_policy.squeeze( to_squeeze_batch.as_ref(), compressor.as_ref(), squeeze_hint, &squeeze_io, ); - if let Some(bytes_to_write) = bytes_to_write { - self.write_batch_to_disk(to_squeeze, &new_batch, bytes_to_write) - .await; - } - match self.try_insert(to_squeeze, new_batch) { - Ok(()) => { - break; + match outcome { + SqueezeOutcome::Replace { + entry: new_batch, + bytes_to_write, + } => { + if let Some(bytes_to_write) = bytes_to_write { + self.write_batch_to_disk(to_squeeze, &new_batch, bytes_to_write) + .await?; + } + match self.try_insert(to_squeeze, new_batch) { + Ok(()) => { + break; + } + Err(batch) => { + to_squeeze_batch = Arc::new(batch); + } + } } - Err(batch) => { - to_squeeze_batch = Arc::new(batch); + SqueezeOutcome::Remove => { + self.remove_disk_entry(to_squeeze).await; + break; } } } + Ok(()) } fn disk_entry_from_squeezed(array: &LiquidSqueezedArrayRef) -> CacheEntry { - let constructor: fn(DataType) -> CacheEntry = match array.disk_backing() { - SqueezedBacking::Liquid => CacheEntry::disk_liquid, - SqueezedBacking::Arrow => CacheEntry::disk_arrow, - }; - constructor(array.original_arrow_data_type()) + let data_type = array.original_arrow_data_type(); + match array.disk_backing() { + SqueezedBacking::Liquid(n) => CacheEntry::disk_liquid(data_type, n), + SqueezedBacking::Arrow(n) => CacheEntry::disk_arrow(data_type, n), + } } async fn maybe_hydrate( @@ -466,7 +573,7 @@ impl LiquidCache { materialized: MaterializedEntry<'_>, expression: Option<&CacheExpression>, ) { - let compressor = self.io_context.get_compressor(entry_id); + let compressor = self.metadata.get_compressor(entry_id); if let Some(new_entry) = self.hydration_policy.hydrate(&HydrationRequest { entry_id: *entry_id, cached, @@ -481,7 +588,7 @@ impl LiquidCache { cached: cached_type, new: new_type, }); - 
self.insert_inner(*entry_id, new_entry).await; + let _ = self.insert_inner(*entry_id, new_entry).await; } } @@ -515,7 +622,7 @@ impl LiquidCache { Some(selection) => Some(array.filter(selection)), None => Some(array.to_arrow_array()), }, - CacheEntry::DiskArrow(_) | CacheEntry::DiskLiquid(_) => { + CacheEntry::DiskArrow { .. } | CacheEntry::DiskLiquid { .. } => { self.read_disk_array(batch.as_ref(), entry_id, expression, selection) .await } @@ -534,7 +641,7 @@ impl LiquidCache { selection: Option<&BooleanBuffer>, ) -> Option { match entry { - CacheEntry::DiskArrow(data_type) => { + CacheEntry::DiskArrow { data_type, .. } => { if let Some(selection) = selection && selection.count_set_bits() == 0 { @@ -556,7 +663,7 @@ impl LiquidCache { None => Some(full_array), } } - CacheEntry::DiskLiquid(data_type) => { + CacheEntry::DiskLiquid { data_type, .. } => { if let Some(selection) = selection && selection.count_set_bits() == 0 { @@ -680,31 +787,52 @@ impl LiquidCache { } } - async fn write_batch_to_disk(&self, entry_id: EntryID, batch: &CacheEntry, bytes: Bytes) { + async fn write_batch_to_disk( + &self, + entry_id: EntryID, + batch: &CacheEntry, + bytes: Bytes, + ) -> Result<(), CacheFull> { + let len = bytes.len(); + loop { + if self.budget.try_reserve_disk(len).is_ok() { + break; + } + let victims = self.cache_policy.find_disk_victim(8); + if victims.is_empty() { + return Err(CacheFull); + } + for victim in victims { + self.remove_disk_entry(victim).await; + } + } self.trace(InternalEvent::IoWrite { entry: entry_id, kind: CachedBatchType::from(batch), - bytes: bytes.len(), + bytes: len, }); - let len = bytes.len(); - self.io_context.write(&entry_id, bytes).await.unwrap(); - self.budget.add_used_disk_bytes(len); + self.store + .put(entry_id_to_key(&entry_id), bytes.to_vec()) + .await + .expect("write failed"); + Ok(()) } async fn read_disk_arrow_array(&self, entry_id: &EntryID) -> ArrayRef { let bytes = self - .io_context - .read(entry_id, None) + .store + .get(&entry_id_to_key(entry_id)) .await .expect("read failed"); - let cursor = std::io::Cursor::new(bytes.to_vec()); + let bytes_len = bytes.len(); + let cursor = std::io::Cursor::new(bytes); let mut reader = arrow::ipc::reader::StreamReader::try_new(cursor, None).expect("create reader failed"); let batch = reader.next().unwrap().expect("read batch failed"); let array = batch.column(0).clone(); self.trace(InternalEvent::IoReadArrow { entry: *entry_id, - bytes: bytes.len(), + bytes: bytes_len, }); array } @@ -714,19 +842,19 @@ impl LiquidCache { entry_id: &EntryID, ) -> crate::liquid_array::LiquidArrayRef { let bytes = self - .io_context - .read(entry_id, None) + .store + .get(&entry_id_to_key(entry_id)) .await .expect("read failed"); self.trace(InternalEvent::IoReadLiquid { entry: *entry_id, bytes: bytes.len(), }); - let compressor_states = self.io_context.get_compressor(entry_id); + let compressor_states = self.metadata.get_compressor(entry_id); let compressor = compressor_states.fsst_compressor(); (crate::liquid_array::ipc::read_from_bytes( - bytes, + Bytes::from(bytes), &crate::liquid_array::ipc::LiquidIPCContext::new(compressor), )) as _ } @@ -761,7 +889,7 @@ impl LiquidCache { .expect("selection must match array length"); Some(self.eval_predicate_on_array(filtered, predicate)) } - entry @ CacheEntry::DiskArrow(_) => { + entry @ CacheEntry::DiskArrow { .. 
} => { let array = self.read_disk_arrow_array(entry_id).await; self.maybe_hydrate(entry_id, entry, MaterializedEntry::Arrow(&array), None) .await; @@ -783,7 +911,7 @@ impl LiquidCache { }); Some(array.try_eval_predicate(predicate, selection)) } - entry @ CacheEntry::DiskLiquid(_) => { + entry @ CacheEntry::DiskLiquid { .. } => { let liquid = self.read_disk_liquid_array(entry_id).await; self.maybe_hydrate(entry_id, entry, MaterializedEntry::Liquid(&liquid), None) .await; @@ -838,11 +966,11 @@ impl LiquidCache { mod tests { use super::*; use crate::cache::{ - CacheEntry, CacheExpression, CachePolicy, LiquidCacheBuilder, TranscodeSqueezeEvict, - policies::LruPolicy, - transcode_liquid_inner, + CacheEntry, CacheExpression, CachePolicy, LiquidCacheBuilder, LiquidPolicy, + TranscodeSqueezeEvict, transcode_liquid_inner, utils::{ - LiquidCompressorStates, create_cache_store, create_test_array, create_test_arrow_array, + LiquidCompressorStates, arrow_to_bytes, create_cache_store, create_test_array, + create_test_arrow_array, }, }; use crate::liquid_array::{ @@ -872,7 +1000,7 @@ mod tests { } impl CachePolicy for TestPolicy { - fn find_victim(&self, _cnt: usize) -> Vec { + fn find_memory_victim(&self, _cnt: usize) -> Vec { self.advice_count.fetch_add(1, Ordering::SeqCst); let id_to_use = self.target_id.unwrap(); vec![id_to_use] @@ -883,7 +1011,7 @@ mod tests { async fn test_basic_cache_operations() { // Test basic insert, get, and size tracking in one test let budget_size = 10 * 1024; - let store = create_cache_store(budget_size, Box::new(LruPolicy::new())).await; + let store = create_cache_store(budget_size, Box::new(LiquidPolicy::new())).await; // 1. Initial budget should be empty assert_eq!(store.budget.memory_usage_bytes(), 0); @@ -892,7 +1020,7 @@ mod tests { let entry_id1: EntryID = EntryID::from(1); let array1 = create_test_array(100); let size1 = array1.memory_usage_bytes(); - store.insert_inner(entry_id1, array1).await; + store.insert_inner(entry_id1, array1).await.unwrap(); // Verify budget usage and data correctness assert_eq!(store.budget.memory_usage_bytes(), size1); @@ -905,13 +1033,13 @@ mod tests { let entry_id2: EntryID = EntryID::from(2); let array2 = create_test_array(200); let size2 = array2.memory_usage_bytes(); - store.insert_inner(entry_id2, array2).await; + store.insert_inner(entry_id2, array2).await.unwrap(); assert_eq!(store.budget.memory_usage_bytes(), size1 + size2); let array3 = create_test_array(150); let size3 = array3.memory_usage_bytes(); - store.insert_inner(entry_id1, array3).await; + store.insert_inner(entry_id1, array3).await.unwrap(); assert_eq!(store.budget.memory_usage_bytes(), size3 + size2); assert!(store.index().get(&EntryID::from(999)).is_none()); @@ -919,10 +1047,10 @@ mod tests { #[tokio::test] async fn get_arrow_array_with_expression_extracts_year() { - let store = create_cache_store(1 << 20, Box::new(LruPolicy::new())).await; + let store = create_cache_store(1 << 20, Box::new(LiquidPolicy::new())).await; let entry_id = EntryID::from(42); - let date_values = Date32Array::from(vec![Some(0), Some(365), None, Some(730)]); + let date_values = Date32Array::from(vec![Some(2), Some(365 + 1), None, Some(365 + 100)]); let liquid = LiquidPrimitiveArray::::from_arrow_array(date_values.clone()); let squeezed = SqueezedDate32Array::from_liquid_date32(&liquid, Date32Field::Year); let squeezed: LiquidSqueezedArrayRef = Arc::new(squeezed); @@ -932,7 +1060,8 @@ mod tests { entry_id, CacheEntry::memory_squeezed_liquid(squeezed.clone()), ) - .await; + .await + .unwrap(); 
let expr = Arc::new(CacheExpression::extract_date32(Date32Field::Year)); let result = store @@ -947,10 +1076,10 @@ mod tests { .downcast_ref::() .expect("date32 result"); assert_eq!(result.len(), 4); - assert_eq!(result.value(0), 1970); - assert_eq!(result.value(1), 1971); + assert_eq!(result.value(0), 0); + assert_eq!(result.value(1), 365); assert!(result.is_null(2)); - assert_eq!(result.value(3), 1972); + assert_eq!(result.value(3), 365); } #[tokio::test] @@ -966,13 +1095,19 @@ mod tests { let advisor = TestPolicy::new(Some(entry_id1)); let store = create_cache_store(8000, Box::new(advisor)).await; // Small budget to force advice - store.insert_inner(entry_id1, create_test_array(800)).await; + store + .insert_inner(entry_id1, create_test_array(800)) + .await + .unwrap(); match store.index().get(&entry_id1).unwrap().as_ref() { CacheEntry::MemoryArrow(_) => {} other => panic!("Expected ArrowMemory, got {other:?}"), } - store.insert_inner(entry_id2, create_test_array(800)).await; + store + .insert_inner(entry_id2, create_test_array(800)) + .await + .unwrap(); match store.index().get(&entry_id1).unwrap().as_ref() { CacheEntry::MemoryLiquid(_) => {} other => panic!("Expected LiquidMemory after eviction, got {other:?}"), @@ -985,13 +1120,13 @@ mod tests { concurrent_cache_operations().await; } - #[cfg(feature = "shuttle")] - #[test] - fn shuttle_cache_operations() { - crate::utils::shuttle_test(|| { - block_on(concurrent_cache_operations()); - }); - } + // #[cfg(feature = "shuttle")] + // #[test] + // fn shuttle_cache_operations() { + // crate::utils::shuttle_test(|| { + // block_on(concurrent_cache_operations()); + // }); + // } pub fn block_on(future: F) -> F::Output { #[cfg(feature = "shuttle")] @@ -1009,7 +1144,7 @@ mod tests { let ops_per_thread = 50; let budget_size = num_threads * ops_per_thread * 100 * 8 / 2; - let store = create_cache_store(budget_size, Box::new(LruPolicy::new())).await; + let store = create_cache_store(budget_size, Box::new(LiquidPolicy::new())).await; let mut handles = vec![]; for thread_id in 0..num_threads { @@ -1020,7 +1155,7 @@ mod tests { let unique_id = thread_id * ops_per_thread + i; let entry_id: EntryID = EntryID::from(unique_id); let array = create_test_arrow_array(100); - store.insert(entry_id, array).await; + store.insert(entry_id, array).await.unwrap(); } }); })); @@ -1046,7 +1181,7 @@ mod tests { async fn test_cache_stats_memory_and_disk_usage() { // Build a small cache in blocking liquid mode to avoid background tasks let storage = LiquidCacheBuilder::new() - .with_max_cache_bytes(10 * 1024 * 1024) + .with_max_memory_bytes(10 * 1024 * 1024) .with_squeeze_policy(Box::new(TranscodeSqueezeEvict)) .build() .await; @@ -1054,18 +1189,18 @@ mod tests { // Insert two small batches let arr1: ArrayRef = Arc::new(Int32Array::from_iter_values(0..64)); let arr2: ArrayRef = Arc::new(Int32Array::from_iter_values(0..128)); - storage.insert(EntryID::from(1usize), arr1).await; - storage.insert(EntryID::from(2usize), arr2).await; + storage.insert(EntryID::from(1usize), arr1).await.unwrap(); + storage.insert(EntryID::from(2usize), arr2).await.unwrap(); // Stats after insert: 2 entries, memory usage > 0, disk usage == 0 let s = storage.stats(); assert_eq!(s.total_entries, 2); assert!(s.memory_usage_bytes > 0); assert_eq!(s.disk_usage_bytes, 0); - assert_eq!(s.max_cache_bytes, 10 * 1024 * 1024); + assert_eq!(s.max_memory_bytes, 10 * 1024 * 1024); // Flush to disk and verify memory usage drops and disk usage increases - storage.flush_all_to_disk().await; + 
storage.flush_all_to_disk().await.unwrap(); let s2 = storage.stats(); assert_eq!(s2.total_entries, 2); assert!(s2.disk_usage_bytes > 0); @@ -1075,15 +1210,15 @@ mod tests { #[tokio::test] async fn hydrate_disk_arrow_on_get_promotes_to_memory() { - let store = create_cache_store(1 << 20, Box::new(LruPolicy::new())).await; + let store = create_cache_store(1 << 20, Box::new(LiquidPolicy::new())).await; let entry_id = EntryID::from(321usize); let array = create_test_arrow_array(8); - store.insert(entry_id, array.clone()).await; - store.flush_all_to_disk().await; + store.insert(entry_id, array.clone()).await.unwrap(); + store.flush_all_to_disk().await.unwrap(); { let entry = store.index().get(&entry_id).unwrap(); - assert!(matches!(entry.as_ref(), CacheEntry::DiskArrow(_))); + assert!(matches!(entry.as_ref(), CacheEntry::DiskArrow { .. })); } let result = store.get(&entry_id).await.expect("present"); @@ -1096,7 +1231,7 @@ mod tests { #[tokio::test] async fn hydrate_disk_liquid_on_get_promotes_to_memory_liquid() { - let store = create_cache_store(1 << 20, Box::new(LruPolicy::new())).await; + let store = create_cache_store(1 << 20, Box::new(LiquidPolicy::new())).await; let entry_id = EntryID::from(322usize); let arrow_array: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3, 4])); let compressor = LiquidCompressorStates::new(); @@ -1104,11 +1239,12 @@ mod tests { store .insert_inner(entry_id, CacheEntry::memory_liquid(liquid.clone())) - .await; - store.flush_all_to_disk().await; + .await + .unwrap(); + store.flush_all_to_disk().await.unwrap(); { let entry = store.index().get(&entry_id).unwrap(); - assert!(matches!(entry.as_ref(), CacheEntry::DiskLiquid(_))); + assert!(matches!(entry.as_ref(), CacheEntry::DiskLiquid { .. })); } let result = store.get(&entry_id).await.expect("present"); @@ -1118,4 +1254,113 @@ mod tests { assert!(matches!(entry.as_ref(), CacheEntry::MemoryLiquid(_))); } } + + #[tokio::test] + async fn insert_returns_cache_full_when_memory_and_disk_are_saturated() { + let cache = LiquidCacheBuilder::new() + .with_max_memory_bytes(0) + .with_max_disk_bytes(0) + .with_squeeze_policy(Box::new(TranscodeSqueezeEvict)) + .build() + .await; + let array: ArrayRef = Arc::new(Int32Array::from_iter_values(0..16)); + + let err = cache.insert(EntryID::from(900usize), array).await; + + assert_eq!(err, Err(CacheFull)); + assert!(!cache.is_cached(&EntryID::from(900usize))); + } + + #[tokio::test] + async fn insert_until_disk_full_then_evicts_oldest_disk_entry() { + let first_array: ArrayRef = Arc::new(Int32Array::from_iter_values(0..16)); + let second_array: ArrayRef = Arc::new(Int32Array::from_iter_values(16..32)); + let first_bytes = arrow_to_bytes(&first_array).unwrap().len(); + let second_bytes = arrow_to_bytes(&second_array).unwrap().len(); + let cache = LiquidCacheBuilder::new() + .with_max_memory_bytes(1 << 20) + .with_max_disk_bytes(first_bytes.max(second_bytes)) + .with_squeeze_policy(Box::new(TranscodeSqueezeEvict)) + .with_cache_policy(Box::new(LiquidPolicy::new())) + .build() + .await; + + let first = EntryID::from(910usize); + let second = EntryID::from(911usize); + cache.insert(first, first_array).await.unwrap(); + cache.flush_all_to_disk().await.unwrap(); + assert!(cache.is_cached(&first)); + + cache.insert(second, second_array).await.unwrap(); + cache.flush_all_to_disk().await.unwrap(); + + assert!(!cache.is_cached(&first)); + assert!(matches!( + cache.index().get(&second).unwrap().as_ref(), + CacheEntry::DiskArrow { .. 
} + )); + } + + #[tokio::test] + async fn flush_all_to_disk_evicts_when_overflow() { + let first_array: ArrayRef = Arc::new(Int32Array::from_iter_values(0..16)); + let second_array: ArrayRef = Arc::new(Int32Array::from_iter_values(16..32)); + let disk_bytes = arrow_to_bytes(&first_array).unwrap().len(); + let cache = LiquidCacheBuilder::new() + .with_max_memory_bytes(1 << 20) + .with_max_disk_bytes(disk_bytes) + .with_squeeze_policy(Box::new(TranscodeSqueezeEvict)) + .with_cache_policy(Box::new(LiquidPolicy::new())) + .build() + .await; + let first = EntryID::from(912usize); + let second = EntryID::from(913usize); + cache.insert(first, first_array).await.unwrap(); + cache.flush_all_to_disk().await.unwrap(); + cache.insert(second, second_array).await.unwrap(); + + cache.flush_all_to_disk().await.unwrap(); + + assert!(!cache.is_cached(&first) || !cache.is_cached(&second)); + } + + #[tokio::test] + async fn disk_eviction_releases_budget() { + let array: ArrayRef = Arc::new(Int32Array::from_iter_values(0..16)); + let disk_bytes = arrow_to_bytes(&array).unwrap().len(); + let cache = LiquidCacheBuilder::new() + .with_max_memory_bytes(1 << 20) + .with_max_disk_bytes(disk_bytes) + .with_squeeze_policy(Box::new(TranscodeSqueezeEvict)) + .with_cache_policy(Box::new(LiquidPolicy::new())) + .build() + .await; + let entry = EntryID::from(914usize); + cache.insert(entry, array).await.unwrap(); + cache.flush_all_to_disk().await.unwrap(); + let before = cache.stats().disk_usage_bytes; + + cache.remove_disk_entry(entry).await; + + assert_eq!(cache.stats().disk_usage_bytes, before - disk_bytes); + assert!(!cache.is_cached(&entry)); + } + + #[tokio::test] + async fn flush_all_to_disk_drops_entry_on_unrecoverable_overflow() { + let cache = LiquidCacheBuilder::new() + .with_max_memory_bytes(1 << 20) + .with_max_disk_bytes(0) + .with_squeeze_policy(Box::new(TranscodeSqueezeEvict)) + .build() + .await; + let entry_id = EntryID::from(901usize); + let array: ArrayRef = Arc::new(Int32Array::from_iter_values(0..16)); + cache.insert(entry_id, array).await.unwrap(); + + let result = cache.flush_all_to_disk().await; + + assert_eq!(result, Ok(())); + assert!(!cache.is_cached(&entry_id)); + } } diff --git a/src/core/src/cache/index.rs b/src/core/src/cache/index.rs index b2a84edd..a25fec75 100644 --- a/src/core/src/cache/index.rs +++ b/src/core/src/cache/index.rs @@ -51,6 +51,15 @@ impl ArtIndex { } } + pub(crate) fn remove(&self, entry_id: &EntryID) -> Option> { + let guard = self.art.pin(); + let removed = self.art.remove(*entry_id, &guard); + if removed.is_some() { + self.entry_count.fetch_sub(1, Ordering::Relaxed); + } + removed + } + pub(crate) fn reset(&self) { let guard = self.art.pin(); self.art.keys().into_iter().for_each(|k| { diff --git a/src/core/src/cache/io_context.rs b/src/core/src/cache/io_context.rs index 8f804fb2..d03a8184 100644 --- a/src/core/src/cache/io_context.rs +++ b/src/core/src/cache/io_context.rs @@ -13,12 +13,12 @@ use crate::{ liquid_array::SqueezeIoHandler, }; -/// A trait for objects that can handle IO operations for the cache. +/// Per-entry metadata used by the cache. /// -/// All IO is key-based: entries are identified by their [`EntryID`] and stored -/// in a [`t4::Store`] rather than as individual files on disk. -#[async_trait::async_trait] -pub trait IoContext: Debug + Send + Sync { +/// This trait covers only the metadata side of the cache: where to find a +/// batch's compressor and squeeze hints. All actual byte IO goes through the +/// [`t4::Store`] held by the cache itself. 
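// Editorial sketch: with IO gone from this trait, a custom implementation only has
// to answer metadata questions. Assuming `get_compressor` is the sole required
// method visible in this hunk (other methods may be elided), and reusing this
// file's imports, a hypothetical per-column variant of `DefaultCacheMetadata`
// could look like the following. `PerColumnMetadata` and the id-to-column
// scheme are illustrative only, not part of the patch.
#[derive(Debug, Default)]
struct PerColumnMetadata {
    compressors: RwLock<AHashMap<usize, Arc<LiquidCompressorStates>>>,
}

impl EntryMetadata for PerColumnMetadata {
    fn get_compressor(&self, entry_id: &EntryID) -> Arc<LiquidCompressorStates> {
        // Hypothetical keying: the high bits of the entry id select the column,
        // so each column shares one compressor instead of one global compressor.
        let column = usize::from(*entry_id) >> 16;
        let mut guard = self.compressors.write().unwrap();
        guard
            .entry(column)
            .or_insert_with(|| Arc::new(LiquidCompressorStates::new()))
            .clone()
    }
}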
+pub trait EntryMetadata: Debug + Send + Sync { /// Add a squeeze hint for an entry. fn add_squeeze_hint(&self, _entry_id: &EntryID, _expression: Arc<CacheExpression>) { // Do nothing by default } @@ -36,44 +36,34 @@ /// Get the compressor for an entry. fn get_compressor(&self, entry_id: &EntryID) -> Arc<LiquidCompressorStates>; - - /// Read bytes for the given entry, optionally restricted to the provided range. - async fn read( - &self, - entry_id: &EntryID, - range: Option<Range<usize>>, - ) -> Result<Bytes, std::io::Error>; - - /// Write data for the given entry. - async fn write(&self, entry_id: &EntryID, data: Bytes) -> Result<(), std::io::Error>; } /// Convert an [`EntryID`] to a t4 key (8-byte little-endian representation). -fn entry_id_to_key(entry_id: &EntryID) -> Vec<u8> { +pub(crate) fn entry_id_to_key(entry_id: &EntryID) -> Vec<u8> { usize::from(*entry_id).to_le_bytes().to_vec() } -/// A default implementation of [`IoContext`] backed by a [`t4::Store`]. -#[derive(Debug)] -pub struct DefaultIoContext { +/// A default implementation of [`EntryMetadata`]. +/// +/// All entries share a single [`LiquidCompressorStates`] and squeeze hints are +/// stored in a flat map keyed by [`EntryID`]. +#[derive(Debug, Default)] +pub struct DefaultCacheMetadata { compressor_state: Arc<LiquidCompressorStates>, squeeze_hints: RwLock<AHashMap<EntryID, Arc<CacheExpression>>>, - store: t4::Store, } -impl DefaultIoContext { - /// Create a new instance of [`DefaultIoContext`] backed by the given [`t4::Store`]. - pub fn new(store: t4::Store) -> Self { +impl DefaultCacheMetadata { + /// Create a new instance of [`DefaultCacheMetadata`]. + pub fn new() -> Self { Self { compressor_state: Arc::new(LiquidCompressorStates::new()), - store, squeeze_hints: RwLock::new(AHashMap::new()), } } } -#[async_trait::async_trait] -impl IoContext for DefaultIoContext { +impl EntryMetadata for DefaultCacheMetadata { fn add_squeeze_hint(&self, entry_id: &EntryID, expression: Arc<CacheExpression>) { let mut guard = self.squeeze_hints.write().unwrap(); guard.insert(*entry_id, expression); @@ -87,57 +77,21 @@ fn get_compressor(&self, _entry_id: &EntryID) -> Arc<LiquidCompressorStates> { self.compressor_state.clone() } - - async fn read( - &self, - entry_id: &EntryID, - range: Option<Range<usize>>, - ) -> Result<Bytes, std::io::Error> { - let key = entry_id_to_key(entry_id); - match range { - Some(range) => { - let len = range.end - range.start; - let bytes = self - .store - .get_range(&key, range.start, len) - .await - .map_err(|e| std::io::Error::other(e.to_string()))?; - Ok(Bytes::from(bytes)) - } - None => { - let bytes = self - .store - .get(&key) - .await - .map_err(|e| std::io::Error::other(e.to_string()))?; - Ok(Bytes::from(bytes)) - } - } - } - - async fn write(&self, entry_id: &EntryID, data: Bytes) -> Result<(), std::io::Error> { - let key = entry_id_to_key(entry_id); - self.store - .put(key, data.to_vec()) - .await - .map_err(|e| std::io::Error::other(e.to_string()))?; - Ok(()) - } } -/// A default implementation of [SqueezeIoHandler] that uses the default [IoContext]. +/// A default implementation of [SqueezeIoHandler] backed by a [`t4::Store`]. #[derive(Debug)] pub struct DefaultSqueezeIo { - io_context: Arc<dyn IoContext>, + store: t4::Store, entry_id: EntryID, observer: Arc<Observer>, } impl DefaultSqueezeIo { /// Create a new instance of [DefaultSqueezeIo]. 
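// Editorial sketch: `entry_id_to_key` above pins the on-store key format to the
// 8-byte little-endian encoding of the entry id. For illustration, the inverse
// mapping (hypothetical helper, assuming a 64-bit target where usize is 8 bytes):
fn key_to_entry_index(key: &[u8]) -> usize {
    let mut buf = [0u8; 8];
    buf.copy_from_slice(&key[..8]);
    usize::from_le_bytes(buf)
}
// e.g. key_to_entry_index(&entry_id_to_key(&EntryID::from(42usize))) == 42.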
- pub fn new(io_context: Arc<dyn IoContext>, entry_id: EntryID, observer: Arc<Observer>) -> Self { + pub fn new(store: t4::Store, entry_id: EntryID, observer: Arc<Observer>) -> Self { Self { - io_context, + store, entry_id, observer, } @@ -147,7 +101,22 @@ #[async_trait::async_trait] impl SqueezeIoHandler for DefaultSqueezeIo { async fn read(&self, range: Option<Range<usize>>) -> std::io::Result<Bytes> { - let bytes = self.io_context.read(&self.entry_id, range).await?; + let key = entry_id_to_key(&self.entry_id); + let bytes = match range { + Some(range) => { + let len = range.end - range.start; + self.store + .get_range(&key, range.start, len) + .await + .map_err(|e| std::io::Error::other(e.to_string()))? + } + None => self + .store + .get(&key) + .await + .map_err(|e| std::io::Error::other(e.to_string()))?, + }; + let bytes = Bytes::from(bytes); self.observer .record_internal(InternalEvent::IoReadSqueezedBacking { entry: self.entry_id, diff --git a/src/core/src/cache/mod.rs b/src/core/src/cache/mod.rs index 663f19fd..daf9c46e 100644 --- a/src/core/src/cache/mod.rs +++ b/src/core/src/cache/mod.rs @@ -19,7 +19,7 @@ pub use core::LiquidCache; pub use expressions::{CacheExpression, VariantRequest}; #[cfg(test)] pub(crate) use io_context::TestSqueezeIo; -pub use io_context::{DefaultIoContext, DefaultSqueezeIo, IoContext}; +pub use io_context::{DefaultCacheMetadata, DefaultSqueezeIo, EntryMetadata}; pub use liquid_expr::LiquidExpr; pub use observer::EventTrace; pub use observer::Observer; @@ -31,6 +31,10 @@ pub use policies::{ pub use transcode::{transcode_liquid_inner, transcode_liquid_inner_with_hint}; pub use utils::{EntryID, LiquidCompressorStates}; +/// The cache could not reserve enough disk budget for a write. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct CacheFull; + // Backwards-compatible module paths for existing imports. /// Legacy path: re-export cache policy types under `cache::cache_policies`. pub mod cache_policies { diff --git a/src/core/src/cache/observer/internal_tracing.rs b/src/core/src/cache/observer/internal_tracing.rs index b73e8ad6..21c4a534 100644 --- a/src/core/src/cache/observer/internal_tracing.rs +++ b/src/core/src/cache/observer/internal_tracing.rs @@ -24,6 +24,10 @@ pub(crate) enum InternalEvent { kind: CachedBatchType, bytes: usize, }, + DiskEvict { + entry: EntryID, + bytes: usize, + }, IoReadSqueezedBacking { entry: EntryID, bytes: usize, @@ -119,6 +123,14 @@ impl fmt::Display for InternalEvent { bytes ) } + InternalEvent::DiskEvict { entry, bytes } => { + write!( + f, + "event=disk_evict entry={} bytes={}", + usize::from(*entry), + bytes + ) + } + InternalEvent::IoReadSqueezedBacking { entry, bytes } => { write!( f, diff --git a/src/core/src/cache/observer/mod.rs b/src/core/src/cache/observer/mod.rs index cba1c300..38de5fc4 100644 --- a/src/core/src/cache/observer/mod.rs +++ b/src/core/src/cache/observer/mod.rs @@ -99,9 +99,15 @@ impl Observer { self.runtime.incr_hit_date32_expression(); } + #[inline] + pub(crate) fn on_disk_reservation_failure(&self) { + self.runtime.incr_disk_reservation_failures(); + } + pub(crate) fn record_internal(&self, event: InternalEvent) { match event { InternalEvent::IoWrite { .. } => self.runtime.incr_write_io_count(), + InternalEvent::DiskEvict { .. } => self.runtime.incr_disk_evictions(), InternalEvent::IoReadArrow { .. } | InternalEvent::IoReadLiquid { .. 
} => { self.runtime.incr_read_io_count() } diff --git a/src/core/src/cache/observer/stats.rs b/src/core/src/cache/observer/stats.rs index f81d57f5..fa0c3d9a 100644 --- a/src/core/src/cache/observer/stats.rs +++ b/src/core/src/cache/observer/stats.rs @@ -103,6 +103,8 @@ define_runtime_stats! { (hit_date32_expression_calls, "Number of `hit_date32_expression` calls.", incr_hit_date32_expression), (read_io_count, "Number of read IO operations.", incr_read_io_count), (write_io_count, "Number of write IO operations.", incr_write_io_count), + (disk_evictions, "Number of disk cache entries evicted.", incr_disk_evictions), + (disk_reservation_failures, "Number of failed disk budget reservations.", incr_disk_reservation_failures), (eval_predicate_on_liquid_failed, "Number of `eval_predicate` calls that failed on Liquid array.", incr_eval_predicate_on_liquid_failed), (squeezed_decompressed_count, "Number of decompressed Squeezed-Liquid entries.", __incr_squeezed_decompressed_count), (squeezed_total_count, "Total number of Squeezed-Liquid entries.", __incr_squeezed_total_count), @@ -144,8 +146,10 @@ pub struct CacheStats { pub memory_usage_bytes: usize, /// Total disk usage of the cache. pub disk_usage_bytes: usize, - /// Maximum cache size. - pub max_cache_bytes: usize, + /// Maximum memory size. + pub max_memory_bytes: usize, + /// Maximum disk size. + pub max_disk_bytes: usize, /// Runtime counters snapshot. pub runtime: RuntimeStatsSnapshot, } diff --git a/src/core/src/cache/policies/cache/clock.rs b/src/core/src/cache/policies/cache/clock.rs deleted file mode 100644 index c9b40555..00000000 --- a/src/core/src/cache/policies/cache/clock.rs +++ /dev/null @@ -1,361 +0,0 @@ -//! CLOCK (second-chance) cache policy implementation with optional size awareness. - -use std::{collections::HashMap, fmt, ptr::NonNull, sync::Arc}; - -use crate::{ - cache::{cached_batch::CachedBatchType, utils::EntryID}, - sync::Mutex, -}; - -use super::{ - CachePolicy, - doubly_linked_list::{DoublyLinkedList, DoublyLinkedNode, drop_boxed_node}, -}; - -type ClockEntrySizeFn = Option usize + Send + Sync>>; - -/// The CLOCK (second-chance) eviction policy with optional size awareness. -#[derive(Default)] -pub struct ClockPolicy { - state: Mutex, - size_of: ClockEntrySizeFn, -} - -#[derive(Debug)] -struct ClockNode { - entry_id: EntryID, - referenced: bool, -} - -type NodePtr = NonNull>; - -#[derive(Debug, Default)] -struct ClockInternalState { - map: HashMap, - list: DoublyLinkedList, - hand: Option, - total_size: usize, -} - -impl fmt::Debug for ClockPolicy { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.debug_struct("ClockPolicy") - .field("state", &self.state) - .finish() - } -} - -impl ClockPolicy { - /// Create a new CLOCK policy. - pub fn new() -> Self { - Self::new_with_size_fn(None) - } - - /// Create a new CLOCK policy with size awareness. 
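// Editorial sketch: the second-chance rule this (now deleted) policy encoded,
// reduced to a ring of (id, referenced) pairs. Illustration only; the real
// implementation below walks an intrusive doubly linked list instead.
fn clock_victim(ring: &mut Vec<(usize, bool)>, hand: &mut usize) -> Option<usize> {
    while !ring.is_empty() {
        let idx = *hand % ring.len();
        if ring[idx].1 {
            // Referenced since the last sweep: clear the bit and spare the entry.
            ring[idx].1 = false;
            *hand = idx + 1;
        } else {
            // Not referenced: evict it and park the hand on its successor.
            *hand = idx;
            return Some(ring.remove(idx).0);
        }
    }
    None
}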
- pub fn new_with_size_fn(size_of: ClockEntrySizeFn) -> Self { - ClockPolicy { - state: Mutex::new(ClockInternalState::default()), - size_of, - } - } - - fn entry_size(&self, entry_id: &EntryID) -> usize { - self.size_of.as_ref().map(|f| f(entry_id)).unwrap_or(1) - } -} - -unsafe impl Send for ClockPolicy {} -unsafe impl Sync for ClockPolicy {} - -impl CachePolicy for ClockPolicy { - fn find_victim(&self, cnt: usize) -> Vec { - let mut state = self.state.lock().unwrap(); - if cnt == 0 { - return Vec::new(); - } - - let mut evicted = Vec::with_capacity(cnt); - let mut cursor = match state.hand { - Some(ptr) => Some(ptr), - None => state.list.head(), - }; - - for _ in 0..cnt { - loop { - let Some(handle) = cursor else { - state.hand = None; - break; - }; - - let mut handle_ptr = handle; - if unsafe { handle_ptr.as_ref() }.data.referenced { - unsafe { handle_ptr.as_mut() }.data.referenced = false; - let next = unsafe { handle_ptr.as_ref().next }.or(state.list.head()); - cursor = next; - state.hand = next; - } else { - let victim_id = unsafe { handle_ptr.as_ref().data.entry_id }; - let succ = unsafe { handle_ptr.as_ref().next }; - state - .map - .remove(&victim_id) - .expect("pointer must exist in map"); - unsafe { - state.list.unlink(handle_ptr); - drop_boxed_node(handle_ptr); - } - state.total_size -= self.entry_size(&victim_id); - state.hand = succ.or(state.list.head()); - evicted.push(victim_id); - cursor = state.hand; - break; - } - } - - if state.hand.is_none() { - break; - } - } - - evicted - } - - fn notify_insert(&self, entry_id: &EntryID, _batch_type: CachedBatchType) { - let mut state = self.state.lock().unwrap(); - - if let Some(mut existing) = state.map.get(entry_id).copied() { - unsafe { - existing.as_mut().data.referenced = true; - } - return; - } - - let node = DoublyLinkedNode::new(ClockNode { - entry_id: *entry_id, - referenced: true, - }); - let new_ptr = NonNull::from(Box::leak(node)); - - unsafe { state.list.push_back(new_ptr) }; - if state.hand.is_none() { - state.hand = Some(new_ptr); - } - - state.map.insert(*entry_id, new_ptr); - state.total_size += self.entry_size(entry_id); - } - - fn notify_access(&self, entry_id: &EntryID, _batch_type: CachedBatchType) { - let state = self.state.lock().unwrap(); - if let Some(mut handle) = state.map.get(entry_id).copied() { - unsafe { - handle.as_mut().data.referenced = true; - } - } - } -} - -impl Drop for ClockPolicy { - fn drop(&mut self) { - if let Ok(mut state) = self.state.lock() { - let handles: Vec<_> = state.map.drain().map(|(_, ptr)| ptr).collect(); - for ptr in handles { - unsafe { - state.list.unlink(ptr); - drop_boxed_node(ptr); - } - } - unsafe { - state.list.drop_all(); - } - state.hand = None; - state.total_size = 0; - } - } -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::cache::{ - cached_batch::CacheEntry, - utils::{EntryID, create_cache_store, create_test_arrow_array}, - }; - - fn entry(id: usize) -> EntryID { - id.into() - } - - #[test] - fn test_clock_policy_insertion_order() { - let advisor = ClockPolicy::new(); - - let entry_id1 = EntryID::from(1); - let entry_id2 = EntryID::from(2); - let entry_id3 = EntryID::from(3); - - advisor.notify_insert(&entry_id1, CachedBatchType::MemoryArrow); - advisor.notify_insert(&entry_id2, CachedBatchType::MemoryArrow); - advisor.notify_insert(&entry_id3, CachedBatchType::MemoryArrow); - - assert_eq!(advisor.find_victim(1), vec![entry_id1]); - } - - #[test] - fn test_clock_policy_sequential_evictions() { - let advisor = ClockPolicy::new(); - - let entry_id1 = 
EntryID::from(1); - let entry_id2 = EntryID::from(2); - let entry_id3 = EntryID::from(3); - - advisor.notify_insert(&entry_id1, CachedBatchType::MemoryArrow); - advisor.notify_insert(&entry_id2, CachedBatchType::MemoryArrow); - advisor.notify_insert(&entry_id3, CachedBatchType::MemoryArrow); - - assert_eq!(advisor.find_victim(1), vec![entry_id1]); - assert_eq!(advisor.find_victim(1), vec![entry_id2]); - assert_eq!(advisor.find_victim(1), vec![entry_id3]); - } - - #[test] - fn test_clock_policy_single_item() { - let advisor = ClockPolicy::new(); - - let entry_id1 = EntryID::from(1); - advisor.notify_insert(&entry_id1, CachedBatchType::MemoryArrow); - - assert_eq!(advisor.find_victim(1), vec![entry_id1]); - } - - #[test] - fn test_clock_policy_advise_empty() { - let advisor = ClockPolicy::new(); - - assert_eq!(advisor.find_victim(1), vec![]); - } - - #[tokio::test] - async fn test_clock_policy_integration_with_store() { - let advisor = ClockPolicy::new(); - let store = create_cache_store(3100, Box::new(advisor)).await; - - let entry_id1 = EntryID::from(1); - let entry_id2 = EntryID::from(2); - let entry_id3 = EntryID::from(3); - - store.insert(entry_id1, create_test_arrow_array(100)).await; - store.insert(entry_id2, create_test_arrow_array(100)).await; - store.insert(entry_id3, create_test_arrow_array(100)).await; - - let entry_id4 = EntryID::from(4); - store.insert(entry_id4, create_test_arrow_array(100)).await; - - let data = store.index().get(&entry_id1).unwrap(); - assert!(matches!(data.as_ref(), CacheEntry::DiskLiquid(_))); - assert!(store.index().get(&entry_id2).is_some()); - assert!(store.index().get(&entry_id3).is_some()); - assert!(store.index().get(&entry_id4).is_some()); - } - - #[test] - fn test_clock_policy_size_awareness_with_closure() { - let policy = - ClockPolicy::new_with_size_fn(Some(Arc::new( - |id: &EntryID| { - if id.gt(&entry(10)) { 100 } else { 1 } - }, - ))); - - let e1 = entry(1); - let e2 = entry(2); - let e3 = entry(11); - - policy.notify_insert(&e1, CachedBatchType::MemoryArrow); - policy.notify_insert(&e2, CachedBatchType::MemoryArrow); - policy.notify_insert(&e3, CachedBatchType::MemoryArrow); - - let state = policy.state.lock().unwrap(); - assert_eq!(state.total_size, 102); - } - - #[test] - fn test_clock_policy_size_awareness_without_closure() { - let policy = ClockPolicy::new(); - - let e1 = entry(1); - let e2 = entry(2); - let e3 = entry(11); - - policy.notify_insert(&e1, CachedBatchType::MemoryArrow); - policy.notify_insert(&e2, CachedBatchType::MemoryArrow); - policy.notify_insert(&e3, CachedBatchType::MemoryArrow); - - let state = policy.state.lock().unwrap(); - assert_eq!(state.total_size, 3); - } - - #[test] - fn test_clock_policy_size_tracking_on_eviction() { - let policy = - ClockPolicy::new_with_size_fn(Some(Arc::new( - |id: &EntryID| { - if id.gt(&entry(10)) { 100 } else { 1 } - }, - ))); - - let e1 = entry(1); - let e2 = entry(2); - let e3 = entry(11); - - policy.notify_insert(&e1, CachedBatchType::MemoryArrow); - policy.notify_insert(&e2, CachedBatchType::MemoryArrow); - policy.notify_insert(&e3, CachedBatchType::MemoryArrow); - - { - let state = policy.state.lock().unwrap(); - assert_eq!(state.total_size, 102); - } - - let evicted = policy.find_victim(1); - assert_eq!(evicted, vec![e1]); - - { - let state = policy.state.lock().unwrap(); - assert_eq!(state.total_size, 101); - } - - let evicted = policy.find_victim(1); - assert_eq!(evicted, vec![e2]); - - { - let state = policy.state.lock().unwrap(); - assert_eq!(state.total_size, 100); - } - } - 
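// Editorial sketch: the accounting that the size-awareness tests above assert,
// in isolation. `total_size` is just the size function summed over the resident
// entries, so evicting entry 1 (size 1) moves the total from 102 to 101.
fn total_size(resident: &[usize], size_fn: impl Fn(usize) -> usize) -> usize {
    resident.iter().copied().map(size_fn).sum()
}
// total_size(&[1, 2, 11], |id| if id > 10 { 100 } else { 1 }) == 102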
- #[test] - fn test_clock_policy_reinsert_sets_reference_bit() { - let policy = ClockPolicy::new(); - let entry_id = entry(42); - - policy.notify_insert(&entry_id, CachedBatchType::MemoryArrow); - - { - let state = policy.state.lock().unwrap(); - let mut node_ptr = state.map.get(&entry_id).copied().unwrap(); - unsafe { - node_ptr.as_mut().data.referenced = false; - } - } - - policy.notify_insert(&entry_id, CachedBatchType::MemoryArrow); - - let state = policy.state.lock().unwrap(); - let node_ptr = state.map.get(&entry_id).copied().unwrap(); - unsafe { - assert!(node_ptr.as_ref().data.referenced); - } - assert_eq!(state.map.len(), 1); - } -} diff --git a/src/core/src/cache/policies/cache/doubly_linked_list.rs b/src/core/src/cache/policies/cache/doubly_linked_list.rs index e85c6fdc..1a48caaf 100644 --- a/src/core/src/cache/policies/cache/doubly_linked_list.rs +++ b/src/core/src/cache/policies/cache/doubly_linked_list.rs @@ -43,6 +43,7 @@ impl DoublyLinkedList { self.head } + #[allow(dead_code)] pub(crate) fn tail(&self) -> Option>> { self.tail } @@ -78,6 +79,7 @@ impl DoublyLinkedList { } /// Moves an existing node to the front of the list. + #[allow(dead_code)] pub(crate) unsafe fn move_to_front(&mut self, node_ptr: NonNull>) { unsafe { self.unlink(node_ptr); @@ -121,65 +123,3 @@ impl DoublyLinkedList { pub(crate) unsafe fn drop_boxed_node(ptr: NonNull>) { unsafe { drop(Box::from_raw(ptr.as_ptr())) } } - -#[cfg_attr(not(rust_analyzer), cfg(kani))] -mod proofs { - use super::*; - use kani::any; - use std::ptr::NonNull; - - #[cfg_attr(not(rust_analyzer), kani::proof)] - #[cfg_attr(not(rust_analyzer), kani::unwind(12))] - fn kani_linked_list_push_front() { - let mut list = DoublyLinkedList::::new(); - - // Choose n in [0, 3] non-deterministically - let n_raw: u8 = any(); - kani::assume(n_raw < 4); - let n: usize = n_raw as usize; - - // Track first (future tail) and last (future head) inserted nodes - let mut first_inserted: Option>> = None; - let mut last_inserted: Option>> = None; - - for i in 0..n { - let value: u8 = any(); - let boxed = DoublyLinkedNode::new(value); - let ptr = NonNull::from(Box::leak(boxed)); - if i == 0 { - first_inserted = Some(ptr); - } - last_inserted = Some(ptr); - unsafe { list.push_front(ptr) }; - } - - match n { - 0 => { - assert!(list.head().is_none()); - assert!(list.tail().is_none()); - } - _ => { - let head_ptr = list.head().expect("non-empty list must have head"); - let tail_ptr = list.tail().expect("non-empty list must have tail"); - assert_eq!(head_ptr.as_ptr(), last_inserted.unwrap().as_ptr()); - assert_eq!(tail_ptr.as_ptr(), first_inserted.unwrap().as_ptr()); - - let head_ref = unsafe { head_ptr.as_ref() }; - assert!(head_ref.prev.is_none()); - let tail_ref = unsafe { tail_ptr.as_ref() }; - assert!(tail_ref.next.is_none()); - } - } - - // Count entries by traversing from head using next pointers - let mut count: usize = 0; - let mut current = list.head(); - while let Some(node_ptr) = current { - count += 1; - current = unsafe { node_ptr.as_ref().next }; - } - assert_eq!(count, n); - - unsafe { list.drop_all() }; - } -} diff --git a/src/core/src/cache/policies/cache/filo.rs b/src/core/src/cache/policies/cache/filo.rs deleted file mode 100644 index 41269f36..00000000 --- a/src/core/src/cache/policies/cache/filo.rs +++ /dev/null @@ -1,295 +0,0 @@ -//! FILO (First In, Last Out) and FIFO cache policy implementations. 
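// Editorial sketch: both deleted policies below pop victims from the queue
// front; the only difference is which end `notify_insert` pushes to. The same
// behavior on a plain VecDeque (the real code used an intrusive list so that
// reinsertion can relink an existing node in O(1)):
use std::collections::VecDeque;

fn filo_fifo_demo() {
    let mut filo: VecDeque<u32> = VecDeque::new();
    let mut fifo: VecDeque<u32> = VecDeque::new();
    for id in [1, 2, 3] {
        filo.push_front(id); // newest at the front => newest evicted first
        fifo.push_back(id); // oldest at the front => oldest evicted first
    }
    assert_eq!(filo.pop_front(), Some(3));
    assert_eq!(fifo.pop_front(), Some(1));
}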
- -use std::{collections::HashMap, ptr::NonNull}; - -use crate::{ - cache::{cached_batch::CachedBatchType, utils::EntryID}, - sync::Mutex, -}; - -use super::{ - CachePolicy, - doubly_linked_list::{DoublyLinkedList, DoublyLinkedNode, drop_boxed_node}, -}; - -#[derive(Debug)] -struct QueueNode { - entry_id: EntryID, -} - -type NodePtr = NonNull>; - -#[derive(Debug, Default)] -struct QueueState { - map: HashMap, - list: DoublyLinkedList, -} - -impl QueueState { - fn is_empty(&self) -> bool { - self.list.head().is_none() - } - - fn insert_front(&mut self, entry_id: EntryID) { - if let Some(ptr) = self.map.get(&entry_id).copied() { - unsafe { - self.list.unlink(ptr); - self.list.push_front(ptr); - } - return; - } - - let node = DoublyLinkedNode::new(QueueNode { entry_id }); - let ptr = NonNull::from(Box::leak(node)); - - self.map.insert(entry_id, ptr); - unsafe { - self.list.push_front(ptr); - } - } - - fn insert_back(&mut self, entry_id: EntryID) { - if let Some(ptr) = self.map.get(&entry_id).copied() { - unsafe { - self.list.unlink(ptr); - self.list.push_back(ptr); - } - return; - } - - let node = DoublyLinkedNode::new(QueueNode { entry_id }); - let ptr = NonNull::from(Box::leak(node)); - - self.map.insert(entry_id, ptr); - unsafe { - self.list.push_back(ptr); - } - } - - fn pop_front(&mut self) -> Option { - let head_ptr = self.list.head()?; - let entry_id = unsafe { head_ptr.as_ref().data.entry_id }; - let node_ptr = self - .map - .remove(&entry_id) - .expect("head pointer must have map entry"); - unsafe { - self.list.unlink(node_ptr); - drop_boxed_node(node_ptr); - } - Some(entry_id) - } -} - -impl Drop for QueueState { - fn drop(&mut self) { - let handles: Vec<_> = self.map.drain().map(|(_, ptr)| ptr).collect(); - for ptr in handles { - unsafe { - self.list.unlink(ptr); - drop_boxed_node(ptr); - } - } - unsafe { - self.list.drop_all(); - } - } -} - -/// The policy that implements the FILO (First In, Last Out) algorithm. -/// Newest entries are evicted first. -#[derive(Debug, Default)] -pub struct FiloPolicy { - state: Mutex, -} - -impl FiloPolicy { - /// Create a new [`FiloPolicy`]. - pub fn new() -> Self { - Self { - state: Mutex::new(QueueState::default()), - } - } -} - -// SAFETY: Access to raw pointers is protected by the internal `Mutex`. -unsafe impl Send for FiloPolicy {} -unsafe impl Sync for FiloPolicy {} - -impl CachePolicy for FiloPolicy { - fn find_victim(&self, cnt: usize) -> Vec { - if cnt == 0 { - return vec![]; - } - - let mut state = self.state.lock().unwrap(); - if state.is_empty() { - return vec![]; - } - - let mut victims = Vec::with_capacity(cnt); - for _ in 0..cnt { - let Some(entry) = state.pop_front() else { - break; - }; - victims.push(entry); - } - victims - } - - fn notify_insert(&self, entry_id: &EntryID, _batch_type: CachedBatchType) { - let mut state = self.state.lock().unwrap(); - state.insert_front(*entry_id); - } -} - -/// The policy that implements the FIFO (First In, First Out) algorithm. -/// Oldest entries are evicted first. -#[derive(Debug, Default)] -pub struct FifoPolicy { - state: Mutex, -} - -impl FifoPolicy { - /// Create a new [`FifoPolicy`]. - pub fn new() -> Self { - Self { - state: Mutex::new(QueueState::default()), - } - } -} - -// SAFETY: Access to raw pointers is protected by the internal `Mutex`. 
-unsafe impl Send for FifoPolicy {} -unsafe impl Sync for FifoPolicy {} - -impl CachePolicy for FifoPolicy { - fn find_victim(&self, cnt: usize) -> Vec { - if cnt == 0 { - return vec![]; - } - - let mut state = self.state.lock().unwrap(); - if state.is_empty() { - return vec![]; - } - - let mut victims = Vec::with_capacity(cnt); - for _ in 0..cnt { - let Some(entry) = state.pop_front() else { - break; - }; - victims.push(entry); - } - victims - } - - fn notify_insert(&self, entry_id: &EntryID, _batch_type: CachedBatchType) { - let mut state = self.state.lock().unwrap(); - state.insert_back(*entry_id); - } -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::cache::cached_batch::{CacheEntry, CachedBatchType}; - use crate::cache::utils::{EntryID, create_cache_store, create_test_arrow_array}; - - fn entry(id: usize) -> EntryID { - id.into() - } - - #[tokio::test] - async fn test_filo_advisor() { - let advisor = FiloPolicy::new(); - let store = create_cache_store(3100, Box::new(advisor)).await; - - let entry_id1 = EntryID::from(1); - let entry_id2 = EntryID::from(2); - let entry_id3 = EntryID::from(3); - - store.insert(entry_id1, create_test_arrow_array(100)).await; - - let data = store.index().get(&entry_id1).unwrap(); - assert!(matches!(data.as_ref(), CacheEntry::MemoryArrow(_))); - store.insert(entry_id2, create_test_arrow_array(100)).await; - store.insert(entry_id3, create_test_arrow_array(100)).await; - - let entry_id4: EntryID = EntryID::from(4); - store.insert(entry_id4, create_test_arrow_array(100)).await; - - assert!(store.index().get(&entry_id1).is_some()); - assert!(store.index().get(&entry_id2).is_some()); - assert!(store.index().get(&entry_id4).is_some()); - - let data = store.index().get(&entry_id3).unwrap(); - assert!(matches!(data.as_ref(), CacheEntry::DiskLiquid(_))); - } - - #[test] - fn test_filo_advise_empty() { - let policy = FiloPolicy::new(); - assert!(policy.find_victim(1).is_empty()); - } - - #[test] - fn test_filo_advise_order() { - let policy = FiloPolicy::new(); - let e1 = entry(1); - let e2 = entry(2); - - policy.notify_insert(&e1, CachedBatchType::MemoryArrow); - policy.notify_insert(&e2, CachedBatchType::MemoryArrow); - - assert_eq!(policy.find_victim(1), vec![e2]); - assert_eq!(policy.find_victim(1), vec![e1]); - } - - #[test] - fn test_filo_reinsert_moves_to_front() { - let policy = FiloPolicy::new(); - let first = entry(1); - let second = entry(2); - - policy.notify_insert(&first, CachedBatchType::MemoryArrow); - policy.notify_insert(&second, CachedBatchType::MemoryArrow); - policy.notify_insert(&first, CachedBatchType::MemoryArrow); - - assert_eq!(policy.find_victim(1), vec![first]); - assert_eq!(policy.find_victim(1), vec![second]); - } - - #[test] - fn test_fifo_advise_empty() { - let policy = FifoPolicy::new(); - assert!(policy.find_victim(1).is_empty()); - } - - #[test] - fn test_fifo_advise_order() { - let policy = FifoPolicy::new(); - let e1 = entry(1); - let e2 = entry(2); - - policy.notify_insert(&e1, CachedBatchType::MemoryArrow); - policy.notify_insert(&e2, CachedBatchType::MemoryArrow); - - assert_eq!(policy.find_victim(1), vec![e1]); - assert_eq!(policy.find_victim(1), vec![e2]); - } - - #[test] - fn test_fifo_reinsert_moves_to_back() { - let policy = FifoPolicy::new(); - let first = entry(1); - let second = entry(2); - - policy.notify_insert(&first, CachedBatchType::MemoryArrow); - policy.notify_insert(&second, CachedBatchType::MemoryArrow); - policy.notify_insert(&first, CachedBatchType::MemoryArrow); - - 
assert_eq!(policy.find_victim(1), vec![second]); - assert_eq!(policy.find_victim(1), vec![first]); - } -} diff --git a/src/core/src/cache/policies/cache/lru.rs b/src/core/src/cache/policies/cache/lru.rs deleted file mode 100644 index 794e8500..00000000 --- a/src/core/src/cache/policies/cache/lru.rs +++ /dev/null @@ -1,382 +0,0 @@ -//! LRU cache policy implementation using a hash map and doubly linked list. - -use std::{collections::HashMap, ptr::NonNull}; - -use crate::{ - cache::{cached_batch::CachedBatchType, utils::EntryID}, - sync::Mutex, -}; - -use super::{ - CachePolicy, - doubly_linked_list::{DoublyLinkedList, DoublyLinkedNode, drop_boxed_node}, -}; - -#[derive(Debug)] -struct LruNode { - entry_id: EntryID, -} - -type NodePtr = NonNull>; - -#[derive(Debug, Default)] -struct HashList { - map: HashMap, - list: DoublyLinkedList, -} - -impl HashList { - fn tail(&self) -> Option { - self.list.tail() - } - - unsafe fn move_to_front(&mut self, node_ptr: NodePtr) { - unsafe { self.list.move_to_front(node_ptr) }; - } - - unsafe fn push_front(&mut self, node_ptr: NodePtr) { - unsafe { self.list.push_front(node_ptr) }; - } - - unsafe fn remove_and_release(&mut self, node_ptr: NodePtr) { - unsafe { - self.list.unlink(node_ptr); - drop_boxed_node(node_ptr); - } - } -} - -impl Drop for HashList { - fn drop(&mut self) { - for (_, node_ptr) in self.map.drain() { - unsafe { - self.list.unlink(node_ptr); - drop_boxed_node(node_ptr); - } - } - // Any nodes not tracked in the map (shouldn't happen) get cleaned up here. - unsafe { - self.list.drop_all(); - } - } -} - -/// The policy that implement the LRU algorithm using a HashMap and a doubly linked list. -#[derive(Debug, Default)] -pub struct LruPolicy { - state: Mutex, -} - -impl LruPolicy { - /// Create a new [`LruPolicy`]. - pub fn new() -> Self { - Self { - state: Mutex::new(HashList::default()), - } - } -} - -// SAFETY: The Mutex ensures that only one thread accesses the internal state -// (hash map and intrusive list containing NonNull pointers) at a time, making it safe -// to send and share across threads. 
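// Editorial note: the same Mutex-guards-the-pointers argument recurs for every
// policy in this file set. A minimal standalone sketch of the shape being
// justified (hypothetical type; sound only as long as every pointer access
// stays behind the lock, as in the policies here):
struct PtrUnderLock {
    state: std::sync::Mutex<Option<std::ptr::NonNull<u64>>>,
}
// SAFETY: the NonNull is created, dereferenced, and destroyed only while
// `state` is locked, so moving or sharing the wrapper across threads cannot race.
unsafe impl Send for PtrUnderLock {}
unsafe impl Sync for PtrUnderLock {}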
-unsafe impl Send for LruPolicy {} -unsafe impl Sync for LruPolicy {} - -impl CachePolicy for LruPolicy { - fn find_victim(&self, cnt: usize) -> Vec { - let mut state = self.state.lock().unwrap(); - if cnt == 0 { - return vec![]; - } - - let mut advices = Vec::with_capacity(cnt); - for _ in 0..cnt { - let Some(tail_ptr) = state.tail() else { - break; - }; - let tail_entry_id = unsafe { tail_ptr.as_ref().data.entry_id }; - let node_ptr = state - .map - .remove(&tail_entry_id) - .expect("tail node not found"); - unsafe { - state.remove_and_release(node_ptr); - } - advices.push(tail_entry_id); - } - - advices - } - - fn notify_access(&self, entry_id: &EntryID, _batch_type: CachedBatchType) { - let mut state = self.state.lock().unwrap(); - if let Some(node_ptr) = state.map.get(entry_id).copied() { - unsafe { state.move_to_front(node_ptr) }; - } - } - - fn notify_insert(&self, entry_id: &EntryID, _batch_type: CachedBatchType) { - let mut state = self.state.lock().unwrap(); - - if let Some(existing_node_ptr) = state.map.get(entry_id).copied() { - unsafe { state.move_to_front(existing_node_ptr) }; - return; - } - - let node = DoublyLinkedNode::new(LruNode { - entry_id: *entry_id, - }); - let node_ptr = NonNull::from(Box::leak(node)); - - state.map.insert(*entry_id, node_ptr); - unsafe { - state.push_front(node_ptr); - } - } -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::cache::utils::{EntryID, create_cache_store, create_test_arrow_array}; - use crate::sync::{Arc, Barrier, thread}; - use std::sync::atomic::{AtomicUsize, Ordering}; - - fn entry(id: usize) -> EntryID { - id.into() - } - - fn assert_evict_advice(policy: &LruPolicy, expect_evict: EntryID) { - let advice = policy.find_victim(1); - assert_eq!(advice, vec![expect_evict]); - } - - #[test] - fn test_lru_policy_insertion_order() { - let policy = LruPolicy::new(); - let e1 = entry(1); - let e2 = entry(2); - let e3 = entry(3); - - policy.notify_insert(&e1, CachedBatchType::MemoryArrow); - policy.notify_insert(&e2, CachedBatchType::MemoryArrow); - policy.notify_insert(&e3, CachedBatchType::MemoryArrow); - - assert_evict_advice(&policy, e1); - } - - #[test] - fn test_lru_policy_access_moves_to_front() { - let policy = LruPolicy::new(); - let e1 = entry(1); - let e2 = entry(2); - let e3 = entry(3); - - policy.notify_insert(&e1, CachedBatchType::MemoryArrow); - policy.notify_insert(&e2, CachedBatchType::MemoryArrow); - policy.notify_insert(&e3, CachedBatchType::MemoryArrow); - - policy.notify_access(&e1, CachedBatchType::MemoryArrow); - assert_evict_advice(&policy, e2); - policy.notify_access(&e2, CachedBatchType::MemoryArrow); - assert_evict_advice(&policy, e3); - } - - #[test] - fn test_lru_policy_reinsert_moves_to_front() { - let policy = LruPolicy::new(); - let e1 = entry(1); - let e2 = entry(2); - let e3 = entry(3); - - policy.notify_insert(&e1, CachedBatchType::MemoryArrow); - policy.notify_insert(&e2, CachedBatchType::MemoryArrow); - policy.notify_insert(&e3, CachedBatchType::MemoryArrow); - - policy.notify_insert(&e1, CachedBatchType::MemoryArrow); - assert_evict_advice(&policy, e2); - } - - #[test] - fn test_lru_policy_advise_empty() { - let policy = LruPolicy::new(); - assert_eq!(policy.find_victim(1), vec![]); - } - - #[test] - fn test_lru_policy_advise_single_item_self() { - let policy = LruPolicy::new(); - let e1 = entry(1); - policy.notify_insert(&e1, CachedBatchType::MemoryArrow); - - assert_evict_advice(&policy, e1); - } - - #[test] - fn test_lru_policy_advise_single_item_other() { - let policy = LruPolicy::new(); - 
let e1 = entry(1); - policy.notify_insert(&e1, CachedBatchType::MemoryArrow); - assert_evict_advice(&policy, e1); - } - - #[test] - fn test_lru_policy_access_nonexistent() { - let policy = LruPolicy::new(); - let e1 = entry(1); - let e2 = entry(2); - - policy.notify_insert(&e1, CachedBatchType::MemoryArrow); - policy.notify_insert(&e2, CachedBatchType::MemoryArrow); - - policy.notify_access(&entry(99), CachedBatchType::MemoryArrow); - - assert_evict_advice(&policy, e1); - } - - impl HashList { - fn check_integrity(&self) { - let map_count = self.map.len(); - let forward_count = count_nodes_in_list(self); - let backward_count = count_nodes_reverse(self); - - assert_eq!(map_count, forward_count); - assert_eq!(map_count, backward_count); - } - } - - fn count_nodes_in_list(state: &HashList) -> usize { - let mut count = 0; - let mut current = state.list.head(); - - while let Some(node_ptr) = current { - count += 1; - current = unsafe { node_ptr.as_ref().next }; - } - - count - } - - fn count_nodes_reverse(state: &HashList) -> usize { - let mut count = 0; - let mut current = state.list.tail(); - - while let Some(node_ptr) = current { - count += 1; - current = unsafe { node_ptr.as_ref().prev }; - } - - count - } - - #[test] - fn test_lru_policy_invariants() { - let policy = LruPolicy::new(); - - for i in 0..10 { - policy.notify_insert(&entry(i), CachedBatchType::MemoryArrow); - } - policy.notify_access(&entry(2), CachedBatchType::MemoryArrow); - policy.notify_access(&entry(5), CachedBatchType::MemoryArrow); - policy.find_victim(1); - policy.find_victim(1); - - let state = policy.state.lock().unwrap(); - state.check_integrity(); - - let map_count = state.map.len(); - assert_eq!(map_count, 8); - assert!(!state.map.contains_key(&entry(0))); - assert!(!state.map.contains_key(&entry(1))); - assert!(state.map.contains_key(&entry(2))); - - let head_id = unsafe { state.list.head().unwrap().as_ref().data.entry_id }; - assert_eq!(head_id, entry(5)); - } - - #[test] - fn test_concurrent_lru_operations() { - concurrent_lru_operations(); - } - - #[cfg(feature = "shuttle")] - #[test] - fn shuttle_lru_operations() { - crate::utils::shuttle_test(concurrent_lru_operations); - } - - fn concurrent_lru_operations() { - let policy = Arc::new(LruPolicy::new()); - let num_threads = 4; - let operations_per_thread = 100; - - let total_inserts = Arc::new(AtomicUsize::new(0)); - let total_evictions = Arc::new(AtomicUsize::new(0)); - - let barrier = Arc::new(Barrier::new(num_threads)); - - let mut handles = vec![]; - for thread_id in 0..num_threads { - let policy_clone = policy.clone(); - let total_inserts_clone = total_inserts.clone(); - let total_evictions_clone = total_evictions.clone(); - let barrier_clone = barrier.clone(); - - let handle = thread::spawn(move || { - barrier_clone.wait(); - - for i in 0..operations_per_thread { - let op_type = i % 3; - let entry_id = entry(thread_id * operations_per_thread + i); - - match op_type { - 0 => { - policy_clone.notify_insert(&entry_id, CachedBatchType::MemoryArrow); - total_inserts_clone.fetch_add(1, Ordering::SeqCst); - } - 1 => { - policy_clone.notify_access(&entry_id, CachedBatchType::MemoryArrow); - } - _ => { - let advised = policy_clone.find_victim(1); - if !advised.is_empty() { - total_evictions_clone.fetch_add(1, Ordering::SeqCst); - } - } - } - } - }); - - handles.push(handle); - } - - for handle in handles { - handle.join().unwrap(); - } - - let state = policy.state.lock().unwrap(); - state.check_integrity(); - - let inserts = total_inserts.load(Ordering::SeqCst); - 
let evictions = total_evictions.load(Ordering::SeqCst); - assert!(inserts >= evictions); - } - - #[tokio::test] - async fn test_lru_integration() { - let policy = LruPolicy::new(); - let store = create_cache_store(3000, Box::new(policy)).await; - - let entry_id1 = EntryID::from(1); - let entry_id2 = EntryID::from(2); - let entry_id3 = EntryID::from(3); - - store.insert(entry_id1, create_test_arrow_array(100)).await; - store.insert(entry_id2, create_test_arrow_array(100)).await; - store.insert(entry_id3, create_test_arrow_array(100)).await; - - assert!(store.index().get(&entry_id1).is_some()); - assert!(store.index().get(&entry_id2).is_some()); - assert!(store.index().get(&entry_id3).is_some()); - } -} diff --git a/src/core/src/cache/policies/cache/mod.rs b/src/core/src/cache/policies/cache/mod.rs index a253d1e3..352f83a0 100644 --- a/src/core/src/cache/policies/cache/mod.rs +++ b/src/core/src/cache/policies/cache/mod.rs @@ -3,32 +3,29 @@ use crate::cache::cached_batch::CachedBatchType; use crate::cache::utils::EntryID; -mod clock; mod doubly_linked_list; -mod filo; -mod lru; -mod s3_fifo; -mod sieve; mod three_queue; -pub use clock::ClockPolicy; -pub use filo::FifoPolicy; -pub use filo::FiloPolicy; -pub use lru::LruPolicy; -pub use s3_fifo::S3FifoPolicy; -pub use sieve::SievePolicy; pub use three_queue::LiquidPolicy; /// The cache policy that guides the replacement of LiquidCache pub trait CachePolicy: std::fmt::Debug + Send + Sync { /// Return up to `cnt` entries to evict when the memory budget is full. - fn find_victim(&self, cnt: usize) -> Vec<EntryID>; + fn find_memory_victim(&self, cnt: usize) -> Vec<EntryID>; + + /// Return up to `cnt` disk entries to remove when the disk budget is full. + fn find_disk_victim(&self, _cnt: usize) -> Vec<EntryID> { + vec![] + } /// Notify the cache policy that an entry was inserted. fn notify_insert(&self, _entry_id: &EntryID, _batch_type: CachedBatchType) {} /// Notify the cache policy that an entry was accessed. fn notify_access(&self, _entry_id: &EntryID, _batch_type: CachedBatchType) {} + + /// Notify the cache policy that an entry was removed. + fn notify_remove(&self, _entry_id: &EntryID) {} } #[cfg(test)] @@ -56,7 +53,7 @@ mod tests { let advised_entries_clone = advised_entries.clone(); let handle = thread::spawn(move || { - let advice = policy_clone.find_victim(1); + let advice = policy_clone.find_memory_victim(1); if let Some(entry_id) = advice.first() { let mut entries = advised_entries_clone.lock().unwrap(); entries.push(*entry_id); @@ -83,8 +80,7 @@ mod tests { } fn run_concurrent_invariant_tests() { - concurrent_invariant_advice_once(Arc::new(LruPolicy::new())); - concurrent_invariant_advice_once(Arc::new(FiloPolicy::new())); + concurrent_invariant_advice_once(Arc::new(LiquidPolicy::new())); } #[test] diff --git a/src/core/src/cache/policies/cache/s3_fifo.rs b/src/core/src/cache/policies/cache/s3_fifo.rs deleted file mode 100644 index fbd88278..00000000 --- a/src/core/src/cache/policies/cache/s3_fifo.rs +++ /dev/null @@ -1,379 +0,0 @@ -//! 
S3 FIFO cache policy implementation - -use std::collections::{HashMap, HashSet, VecDeque}; -use std::fmt; -use std::sync::Mutex; - -use crate::cache::CachePolicy; -use crate::cache::{cached_batch::CachedBatchType, utils::EntryID}; - -type EntryFreq = u8; - -#[derive(Debug, Default)] -struct S3FifoInternalState { - small: VecDeque, - main: VecDeque, - ghost: VecDeque, - ghost_set: HashSet, - - frequency: HashMap, - - small_queue_size: usize, - main_queue_size: usize, - total_size: usize, -} - -impl S3FifoInternalState { - fn cap_frequency(freq: u8) -> u8 { - std::cmp::min(freq, 3) - } - - fn inc_frequency(&mut self, entry_id: &EntryID) { - if let Some(freq) = self.frequency.get_mut(entry_id) { - *freq = Self::cap_frequency(*freq + 1); - } - } - - fn dec_frequency(&mut self, entry_id: &EntryID) { - if let Some(freq) = self.frequency.get_mut(entry_id) { - *freq = freq.saturating_sub(1); - } - } - - fn inc_small_queue_size(&mut self, size: usize) { - self.small_queue_size += size; - self.total_size += size; - } - - fn dec_small_queue_size(&mut self, size: usize) { - self.small_queue_size -= size; - self.total_size -= size; - } - - fn inc_main_queue_size(&mut self, size: usize) { - self.main_queue_size += size; - self.total_size += size; - } - - fn dec_main_queue_size(&mut self, size: usize) { - self.main_queue_size -= size; - self.total_size -= size; - } - - fn small_queue_fraction(&self) -> f32 { - if self.total_size == 0 { - 0.0 - } else { - self.small_queue_size as f32 / self.total_size as f32 - } - } - - fn check_if_entry_exists_in_small_or_main(&self, entry_id: &EntryID) -> bool { - self.frequency.contains_key(entry_id) && !self.ghost_set.contains(entry_id) - } -} - -/// The policy that implements object size aware S3Fifo algorithm using Deque. -#[derive(Default)] -pub struct S3FifoPolicy { - state: Mutex, -} - -impl fmt::Debug for S3FifoPolicy { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.debug_struct("S3FifoPolicy") - .field("state", &self.state) - .finish() - } -} - -unsafe impl Send for S3FifoPolicy {} -unsafe impl Sync for S3FifoPolicy {} - -impl S3FifoPolicy { - /// Create a new [`S3FifoPolicy`]. 
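// Editorial sketch: the admission rule of this (now deleted) policy — first
// sightings go to the probationary small queue, and ids remembered in the
// ghost set are promoted straight to main. Plain-collection illustration only
// (the real `notify_insert` below also bumps frequencies and drops the id
// from the ghost queue itself):
use std::collections::{HashSet, VecDeque};

fn s3fifo_admit(id: u64, small: &mut VecDeque<u64>, main: &mut VecDeque<u64>, ghost: &mut HashSet<u64>) {
    if ghost.remove(&id) {
        // Seen before and evicted once: skip probation.
        main.push_front(id);
    } else {
        small.push_front(id);
    }
}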
- pub fn new() -> Self { - Self { - state: Mutex::new(S3FifoInternalState::default()), - } - } - - fn entry_size(&self, _entry_id: &EntryID) -> usize { - 1 - } - - fn evict_from_small(&self, state: &mut S3FifoInternalState) -> Option { - let mut is_evicted = false; - let mut victim: Option = None; - - while !is_evicted && !state.small.is_empty() { - let Some(element) = state.small.pop_back() else { - break; - }; - let freq = state.frequency.get(&element).copied().unwrap_or(0); - let entry_size = self.entry_size(&element); - state.dec_small_queue_size(entry_size); - - if freq > 1 { - state.main.push_front(element); - state.inc_main_queue_size(entry_size); - state.frequency.insert(element, 0); - } else { - // Move to ghost queue - state.ghost.push_front(element); - state.ghost_set.insert(element); - state.frequency.remove(&element); - is_evicted = true; - victim = Some(element); - } - } - victim - } - - fn evict_from_main(&self, state: &mut S3FifoInternalState) -> Option { - let mut is_evicted = false; - let mut victim: Option = None; - - while !is_evicted && !state.main.is_empty() { - let Some(element) = state.main.pop_back() else { - break; - }; - - let freq = state.frequency.get(&element).copied().unwrap_or(0); - let entry_size = self.entry_size(&element); - state.dec_main_queue_size(entry_size); - - if freq > 0 { - state.main.push_front(element); - state.dec_frequency(&element); - state.inc_main_queue_size(entry_size); - } else { - state.frequency.remove(&element); - is_evicted = true; - victim = Some(element); - } - } - victim - } -} - -impl CachePolicy for S3FifoPolicy { - fn find_victim(&self, cnt: usize) -> Vec { - let mut state = self.state.lock().unwrap(); - let mut advices = Vec::with_capacity(cnt); - let threshold_for_small_eviction = 0.1; - while advices.len() < cnt && state.total_size > 0 { - let victim = if !state.small.is_empty() - && state.small_queue_fraction() >= threshold_for_small_eviction - { - self.evict_from_small(&mut state) - } else { - self.evict_from_main(&mut state) - }; - - if let Some(v) = victim { - advices.push(v); - } - } - advices - } - - fn notify_insert(&self, entry_id: &EntryID, _batch_type: CachedBatchType) { - let mut state = self.state.lock().unwrap(); - let entry_size = self.entry_size(entry_id); - - if state.check_if_entry_exists_in_small_or_main(entry_id) { - state.inc_frequency(entry_id); - } else if state.ghost_set.contains(entry_id) { - state.ghost_set.remove(entry_id); - state.ghost.retain(|x| *x != *entry_id); - state.main.push_front(*entry_id); - state.inc_main_queue_size(entry_size); - } else { - state.small.push_front(*entry_id); - state.inc_small_queue_size(entry_size); - state.frequency.insert(*entry_id, 0); - } - } - - fn notify_access(&self, entry_id: &EntryID, _batch_type: CachedBatchType) { - let mut state = self.state.lock().unwrap(); - if state.check_if_entry_exists_in_small_or_main(entry_id) { - state.inc_frequency(entry_id); - } - } -} - -impl Drop for S3FifoPolicy { - fn drop(&mut self) { - let mut state = self.state.lock().unwrap(); - state.small.clear(); - state.main.clear(); - state.ghost.clear(); - state.ghost_set.clear(); - state.frequency.clear(); - state.total_size = 0; - state.small_queue_size = 0; - state.main_queue_size = 0; - } -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::cache::utils::EntryID; - - fn entry(id: usize) -> EntryID { - id.into() - } - - #[test] - fn test_s3fifo_basic_insert_eviction() { - let policy = S3FifoPolicy::new(); - let e1 = entry(1); - let e2 = entry(2); - let e3 = entry(3); - - 
policy.notify_insert(&e1, CachedBatchType::MemoryArrow); - policy.notify_insert(&e2, CachedBatchType::MemoryArrow); - policy.notify_insert(&e3, CachedBatchType::MemoryArrow); - - let evicted = policy.find_victim(1); - assert_eq!(evicted.len(), 1); - } - - #[test] - fn test_s3fifo_frequency_increase() { - let policy = S3FifoPolicy::new(); - let e1 = entry(1); - policy.notify_insert(&e1, CachedBatchType::MemoryArrow); - policy.notify_access(&e1, CachedBatchType::MemoryArrow); - - let state = policy.state.lock().unwrap(); - assert_eq!(*state.frequency.get(&e1).unwrap(), 1); - } - - #[test] - fn test_s3fifo_eviction_order() { - let policy = S3FifoPolicy::new(); - let e1 = entry(1); - let e2 = entry(2); - - policy.notify_insert(&e1, CachedBatchType::MemoryArrow); - policy.notify_insert(&e2, CachedBatchType::MemoryArrow); - - let evicted = policy.find_victim(1); - assert_eq!(evicted[0], e1); - } - - #[test] - fn test_s3fifo_ghost_promote() { - let policy = S3FifoPolicy::new(); - let e1 = entry(1); - - policy.notify_insert(&e1, CachedBatchType::MemoryArrow); - let evicted = policy.find_victim(1); - assert_eq!(evicted[0], e1); - - // Re-insert evicted entry from ghost - policy.notify_insert(&e1, CachedBatchType::MemoryArrow); - let state = policy.state.lock().unwrap(); - assert!(state.main.contains(&e1)); - assert!(!state.ghost_set.contains(&e1)); - } - - #[test] - fn test_s3fifo_size_aware_fraction() { - let policy = S3FifoPolicy::new(); - let e1 = entry(20); - let e2 = entry(30); - - policy.notify_insert(&e1, CachedBatchType::MemoryArrow); - policy.notify_insert(&e2, CachedBatchType::MemoryArrow); - - let state = policy.state.lock().unwrap(); - assert_eq!(state.small_queue_size, 2); - assert_eq!(state.small_queue_fraction(), 1.0); - } - - #[test] - fn test_insert_and_access_updates_freq() { - let policy = S3FifoPolicy::new(); - let e1 = entry(1); - - policy.notify_insert(&e1, CachedBatchType::MemoryArrow); - policy.notify_access(&e1, CachedBatchType::MemoryArrow); - policy.notify_access(&e1, CachedBatchType::MemoryArrow); - - let state = policy.state.lock().unwrap(); - assert_eq!(*state.frequency.get(&e1).unwrap(), 2); // capped at 3 - } - - #[test] - fn test_freq_cap_at_three() { - let policy = S3FifoPolicy::new(); - let e1 = entry(1); - - policy.notify_insert(&e1, CachedBatchType::MemoryArrow); - for _ in 0..10 { - policy.notify_access(&e1, CachedBatchType::MemoryArrow); - } - - let state = policy.state.lock().unwrap(); - assert_eq!(*state.frequency.get(&e1).unwrap(), 3); - } - - #[test] - fn test_eviction_from_s_to_ghost() { - let policy = S3FifoPolicy::new(); - let e1 = entry(1); - - policy.notify_insert(&e1, CachedBatchType::MemoryArrow); - let evicted = policy.find_victim(1); - - assert_eq!(evicted[0], e1); - let state = policy.state.lock().unwrap(); - assert!(state.ghost_set.contains(&e1)); - assert!(state.ghost.contains(&e1)); - } - - #[test] - fn test_eviction_from_main_and_reinsertion_logic() { - let policy = S3FifoPolicy::new(); - let e1 = entry(1); - let e2 = entry(2); - let _e3 = entry(3); - - let mut state = policy.state.lock().unwrap(); - state.main.push_front(e1); - state.frequency.insert(e1, 1); - state.main_queue_size += 1; - - state.main.push_front(e2); - state.frequency.insert(e2, 2); - state.main_queue_size += 1; - state.total_size += 2; - drop(state); - - let evicted = policy.find_victim(2); - assert_eq!(evicted.len(), 2); - } - - #[test] - fn test_s3fifo_reinsert_does_not_duplicate_entry() { - let policy = S3FifoPolicy::new(); - let e1 = entry(1); - let e2 = entry(2); - - 
policy.notify_insert(&e1, CachedBatchType::MemoryArrow); - policy.notify_insert(&e2, CachedBatchType::MemoryArrow); - - policy.notify_insert(&e1, CachedBatchType::MemoryArrow); - - let state = policy.state.lock().unwrap(); - let occurrences = state.small.iter().filter(|&&id| id == e1).count() - + state.main.iter().filter(|&&id| id == e1).count(); - assert_eq!(occurrences, 1); - assert_eq!(*state.frequency.get(&e1).unwrap(), 1); - } -} diff --git a/src/core/src/cache/policies/cache/sieve.rs b/src/core/src/cache/policies/cache/sieve.rs deleted file mode 100644 index 6fe54638..00000000 --- a/src/core/src/cache/policies/cache/sieve.rs +++ /dev/null @@ -1,295 +0,0 @@ -//! SIEVE cache policy implementation. - -use std::{collections::HashMap, fmt, ptr::NonNull}; - -use crate::{ - cache::{cached_batch::CachedBatchType, utils::EntryID}, - sync::Mutex, -}; - -use super::{ - CachePolicy, - doubly_linked_list::{DoublyLinkedList, DoublyLinkedNode, drop_boxed_node}, -}; - -#[derive(Debug)] -struct SieveNode { - entry_id: EntryID, - visited: bool, -} - -type NodePtr = NonNull>; - -#[derive(Debug, Default)] -struct SieveInternalState { - map: HashMap, - list: DoublyLinkedList, - hand: Option, - total_size: usize, -} - -/// The policy that implements object size aware SIEVE algorithm using a HashMap and a doubly linked list. -#[derive(Default)] -pub struct SievePolicy { - state: Mutex, -} - -impl fmt::Debug for SievePolicy { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.debug_struct("SievePolicy") - .field("state", &self.state) - .finish() - } -} - -impl SievePolicy { - /// Create a new [`SievePolicy`]. - pub fn new() -> Self { - Self { - state: Mutex::new(SieveInternalState::default()), - } - } - - fn entry_size(&self, _entry_id: &EntryID) -> usize { - 1 - } -} - -unsafe impl Send for SievePolicy {} -unsafe impl Sync for SievePolicy {} - -impl CachePolicy for SievePolicy { - fn find_victim(&self, cnt: usize) -> Vec { - let mut state = self.state.lock().unwrap(); - let mut advices = Vec::with_capacity(cnt); - for _ in 0..cnt { - let hand_ptr = match state.hand { - Some(ptr) => Some(ptr), - None => state.list.tail(), - }; - let mut hand_ptr = match hand_ptr { - Some(p) => p, - None => break, - }; - loop { - if unsafe { hand_ptr.as_ref() }.data.visited { - unsafe { hand_ptr.as_mut() }.data.visited = false; - let prev = unsafe { hand_ptr.as_ref().prev }; - let next_hand = prev - .or(state.list.tail()) - .expect("non-empty list must have a tail"); - hand_ptr = next_hand; - state.hand = Some(next_hand); - } else { - let victim_id = unsafe { hand_ptr.as_ref().data.entry_id }; - let prev = unsafe { hand_ptr.as_ref().prev }; - let node_ptr = state.map.remove(&victim_id).unwrap(); - unsafe { - state.list.unlink(node_ptr); - drop_boxed_node(node_ptr); - } - state.total_size -= self.entry_size(&victim_id); - advices.push(victim_id); - state.hand = prev.or(state.list.tail()); - break; - } - } - } - advices - } - - fn notify_insert(&self, entry_id: &EntryID, _batch_type: CachedBatchType) { - let mut state = self.state.lock().unwrap(); - if state.map.contains_key(entry_id) { - if let Some(mut node_ptr) = state.map.get(entry_id).copied() { - unsafe { - node_ptr.as_mut().data.visited = true; - } - } - return; - } - - let was_empty = state.list.head().is_none(); - let node = DoublyLinkedNode::new(SieveNode { - entry_id: *entry_id, - visited: false, - }); - let node_ptr = NonNull::from(Box::leak(node)); - state.map.insert(*entry_id, node_ptr); - unsafe { - state.list.push_front(node_ptr); - } - if 
was_empty {
-            state.hand = Some(node_ptr);
-        }
-        state.total_size += self.entry_size(entry_id);
-    }
-
-    fn notify_access(&self, entry_id: &EntryID, _batch_type: CachedBatchType) {
-        let state = self.state.lock().unwrap();
-        if let Some(mut node_ptr) = state.map.get(entry_id).copied() {
-            unsafe {
-                node_ptr.as_mut().data.visited = true;
-            }
-        }
-    }
-}
-
-impl Drop for SievePolicy {
-    fn drop(&mut self) {
-        let mut state = self.state.lock().unwrap();
-        let handles: Vec<_> = state.map.drain().map(|(_, ptr)| ptr).collect();
-        for node_ptr in handles {
-            unsafe {
-                state.list.unlink(node_ptr);
-                drop_boxed_node(node_ptr);
-            }
-        }
-        unsafe {
-            state.list.drop_all();
-        }
-        state.hand = None;
-        state.total_size = 0;
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-    use crate::cache::{
-        cached_batch::CachedBatchType,
-        utils::{EntryID, create_cache_store, create_test_arrow_array},
-    };
-
-    fn entry(id: usize) -> EntryID {
-        id.into()
-    }
-
-    fn assert_evict_advice(policy: &SievePolicy, expect_evict: EntryID) {
-        let advice = policy.find_victim(1);
-        assert_eq!(advice, vec![expect_evict]);
-    }
-
-    #[test]
-    fn test_sieve_insert_order() {
-        let policy = SievePolicy::new();
-        let e1 = entry(1);
-        let e2 = entry(2);
-        let e3 = entry(3);
-
-        policy.notify_insert(&e1, CachedBatchType::MemoryArrow);
-        policy.notify_insert(&e2, CachedBatchType::MemoryArrow);
-        policy.notify_insert(&e3, CachedBatchType::MemoryArrow);
-
-        assert_evict_advice(&policy, e1);
-    }
-
-    #[test]
-    fn test_sieve_access_sets_visited() {
-        let policy = SievePolicy::new();
-        let e1 = entry(1);
-        let e2 = entry(2);
-        let e3 = entry(3);
-
-        policy.notify_insert(&e1, CachedBatchType::MemoryArrow);
-        policy.notify_insert(&e2, CachedBatchType::MemoryArrow);
-        policy.notify_insert(&e3, CachedBatchType::MemoryArrow);
-
-        policy.notify_access(&e1, CachedBatchType::MemoryArrow);
-        assert_evict_advice(&policy, e2);
-    }
-
-    #[test]
-    fn test_sieve_reinsert_marks_visited() {
-        let policy = SievePolicy::new();
-        let e1 = entry(1);
-        let e2 = entry(2);
-
-        policy.notify_insert(&e1, CachedBatchType::MemoryArrow);
-        policy.notify_insert(&e2, CachedBatchType::MemoryArrow);
-
-        policy.notify_insert(&e1, CachedBatchType::MemoryArrow);
-
-        assert_evict_advice(&policy, e2);
-    }
-
-    #[test]
-    fn test_sieve_reinsert_sets_visited_flag() {
-        let policy = SievePolicy::new();
-        let e1 = entry(1);
-
-        policy.notify_insert(&e1, CachedBatchType::MemoryArrow);
-
-        {
-            let state = policy.state.lock().unwrap();
-            let mut node_ptr = state.map.get(&e1).copied().unwrap();
-            unsafe {
-                node_ptr.as_mut().data.visited = false;
-            }
-        }
-
-        policy.notify_insert(&e1, CachedBatchType::MemoryArrow);
-
-        let state = policy.state.lock().unwrap();
-        let node_ptr = state.map.get(&e1).copied().unwrap();
-        unsafe {
-            assert!(node_ptr.as_ref().data.visited);
-        }
-        assert_eq!(state.map.len(), 1);
-    }
-
-    #[test]
-    fn test_sieve_advise_empty() {
-        let policy = SievePolicy::new();
-        assert_eq!(policy.find_victim(1), vec![]);
-    }
-
-    #[test]
-    fn test_sieve_with_sizeof_closure_defined() {
-        let policy = SievePolicy::new();
-
-        let e1 = entry(1);
-        let e2 = entry(2);
-        let e3 = entry(11);
-
-        policy.notify_insert(&e1, CachedBatchType::MemoryArrow);
-        policy.notify_insert(&e2, CachedBatchType::MemoryArrow);
-        policy.notify_insert(&e3, CachedBatchType::MemoryArrow);
-
-        let state = policy.state.lock().unwrap();
-        assert_eq!(state.total_size, 3);
-    }
-
-    #[test]
-    fn test_sieve_sizeof_without_closure() {
-        let policy = SievePolicy::new();
-
-        let e1 = entry(1);
-        let e2 = entry(2);
-        let e3 = entry(11);
-
-        policy.notify_insert(&e1, CachedBatchType::MemoryArrow);
-        policy.notify_insert(&e2, CachedBatchType::MemoryArrow);
-        policy.notify_insert(&e3, CachedBatchType::MemoryArrow);
-
-        let state = policy.state.lock().unwrap();
-        assert_eq!(state.total_size, 3);
-    }
-
-    #[tokio::test]
-    async fn test_sieve_integration() {
-        let advisor = SievePolicy::new();
-        let store = create_cache_store(3000, Box::new(advisor)).await;
-
-        let entry_id1 = EntryID::from(1);
-        let entry_id2 = EntryID::from(2);
-        let entry_id3 = EntryID::from(3);
-
-        store.insert(entry_id1, create_test_arrow_array(100)).await;
-        store.insert(entry_id2, create_test_arrow_array(100)).await;
-        store.insert(entry_id3, create_test_arrow_array(100)).await;
-        assert!(store.index().get(&entry_id1).is_some());
-        assert!(store.index().get(&entry_id2).is_some());
-        assert!(store.index().get(&entry_id3).is_some());
-    }
-}
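Note: the `SievePolicy` deleted above picked victims with a single `find_victim` and tracked a per-entry `visited` bit. For readers who want the algorithm the deleted tests were pinning down, here is a minimal, self-contained sketch of SIEVE-style selection. It is illustrative only: every name is invented for this sketch, and the removed implementation used an intrusive linked list with a stationary "hand" pointer instead of re-queuing entries, which this simplified variant does.

```rust
use std::collections::VecDeque;

/// Minimal SIEVE-style sketch: a FIFO of (key, visited) pairs.
/// Eviction scans from the oldest entry; a visited entry gets its bit
/// cleared and survives, and the first unvisited entry is the victim.
struct MiniSieve<K: Copy + Eq> {
    queue: VecDeque<(K, bool)>,
}

impl<K: Copy + Eq> MiniSieve<K> {
    fn insert(&mut self, key: K) {
        self.queue.push_back((key, false));
    }

    fn access(&mut self, key: K) {
        // Mirrors notify_access above: only flip the visited bit.
        if let Some(slot) = self.queue.iter_mut().find(|(k, _)| *k == key) {
            slot.1 = true;
        }
    }

    fn evict(&mut self) -> Option<K> {
        while let Some((key, visited)) = self.queue.pop_front() {
            if visited {
                // Second chance: keep the entry, clear the bit.
                self.queue.push_back((key, false));
            } else {
                return Some(key);
            }
        }
        None
    }
}
```

This sketch reproduces the semantics the removed tests asserted: inserting e1, e2, e3 makes e1 the victim, and accessing e1 first shifts the victim to e2.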
diff --git a/src/core/src/cache/policies/cache/three_queue.rs b/src/core/src/cache/policies/cache/three_queue.rs
index 57ff6a4b..655c1984 100644
--- a/src/core/src/cache/policies/cache/three_queue.rs
+++ b/src/core/src/cache/policies/cache/three_queue.rs
@@ -97,6 +97,16 @@ impl LiquidQueueInternalState {
         }
         Some(entry_id)
     }
+
+    fn remove(&mut self, entry_id: &EntryID) -> Option<EntryID> {
+        let node_ptr = self.map.remove(entry_id)?;
+        let removed = unsafe { node_ptr.as_ref().data.entry_id };
+        unsafe {
+            self.detach(node_ptr);
+            drop_boxed_node(node_ptr);
+        }
+        Some(removed)
+    }
 }
 
 impl Drop for LiquidQueueInternalState {
@@ -155,7 +165,7 @@ impl CachePolicy for LiquidPolicy {
         inner.upsert_into_queue(*entry_id, target);
     }
 
-    fn find_victim(&self, cnt: usize) -> Vec<EntryID> {
+    fn find_memory_victim(&self, cnt: usize) -> Vec<EntryID> {
         if cnt == 0 {
             return vec![];
         }
@@ -185,7 +195,30 @@
         victims
     }
 
+    fn find_disk_victim(&self, cnt: usize) -> Vec<EntryID> {
+        if cnt == 0 {
+            return vec![];
+        }
+
+        let mut inner = self.inner.lock().unwrap();
+        let mut victims = Vec::with_capacity(cnt);
+
+        while victims.len() < cnt {
+            let Some(entry) = inner.pop_front(QueueKind::Disk) else {
+                break;
+            };
+            victims.push(entry);
+        }
+
+        victims
+    }
+
     fn notify_access(&self, _entry_id: &EntryID, _batch_type: CachedBatchType) {}
+
+    fn notify_remove(&self, entry_id: &EntryID) {
+        let mut inner = self.inner.lock().unwrap();
+        inner.remove(entry_id);
+    }
 }
 
 #[cfg(test)]
@@ -211,9 +244,9 @@ mod tests {
         policy.notify_insert(&liquid_a, CachedBatchType::MemoryLiquid);
         policy.notify_insert(&liquid_b, CachedBatchType::MemoryLiquid);
 
-        assert_eq!(policy.find_victim(1), vec![arrow_a]);
-        assert_eq!(policy.find_victim(2), vec![arrow_b, liquid_a]);
-        assert_eq!(policy.find_victim(1), vec![liquid_b]);
+        assert_eq!(policy.find_memory_victim(1), vec![arrow_a]);
+        assert_eq!(policy.find_memory_victim(2), vec![arrow_b, liquid_a]);
+        assert_eq!(policy.find_memory_victim(1), vec![liquid_b]);
     }
 
     #[test]
@@ -229,7 +262,7 @@
         policy.notify_insert(&arrow_entry, CachedBatchType::MemoryArrow);
 
         // Request more victims than available to ensure we only get what exists.
-        let victims = policy.find_victim(5);
+        let victims = policy.find_memory_victim(5);
         assert_eq!(victims, vec![arrow_entry, liquid_entry, hybrid_entry]);
     }
 
@@ -238,7 +271,7 @@
         let policy = LiquidPolicy::new();
         policy.notify_insert(&entry(1), CachedBatchType::MemoryArrow);
 
-        assert!(policy.find_victim(0).is_empty());
+        assert!(policy.find_memory_victim(0).is_empty());
     }
 
     #[test]
@@ -253,11 +286,25 @@
         policy.notify_insert(&arrow_entry, CachedBatchType::MemoryArrow);
         policy.notify_insert(&liquid_entry, CachedBatchType::MemoryLiquid);
 
-        let victims = policy.find_victim(5);
+        let victims = policy.find_memory_victim(5);
         assert_eq!(victims, vec![arrow_entry, liquid_entry]);
 
         // Only the disk entry remains and should still not be evicted.
-        assert!(policy.find_victim(1).is_empty());
+        assert!(policy.find_memory_victim(1).is_empty());
+    }
+
+    #[test]
+    fn test_disk_victims_and_remove() {
+        let policy = LiquidPolicy::new();
+        let disk_old = entry(1);
+        let disk_new = entry(2);
+
+        policy.notify_insert(&disk_old, CachedBatchType::DiskArrow);
+        policy.notify_insert(&disk_new, CachedBatchType::DiskLiquid);
+
+        assert_eq!(policy.find_disk_victim(1), vec![disk_old]);
+        policy.notify_remove(&disk_new);
+        assert!(policy.find_disk_victim(1).is_empty());
     }
 
     #[test]
@@ -273,8 +320,8 @@
         // Reinserting should refresh the entry as the newest arrow batch.
         policy.notify_insert(&first, CachedBatchType::MemoryArrow);
 
-        assert_eq!(policy.find_victim(1), vec![second]);
-        assert_eq!(policy.find_victim(1), vec![first]);
+        assert_eq!(policy.find_memory_victim(1), vec![second]);
+        assert_eq!(policy.find_memory_victim(1), vec![first]);
    }
 
     #[test]
@@ -286,7 +333,7 @@
         policy.notify_insert(&entry_id, CachedBatchType::MemoryArrow);
         policy.notify_insert(&entry_id, CachedBatchType::MemoryLiquid);
 
-        let victims = policy.find_victim(2);
+        let victims = policy.find_memory_victim(2);
         assert_eq!(victims, vec![entry_id]);
     }
 }
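Note: splitting `find_victim` into `find_memory_victim` and `find_disk_victim`, plus the new `notify_remove`, gives the policy two independent pressure valves. Judging from the tests, memory victims drain the Arrow queue before the Liquid queue and never touch disk-resident entries, while disk victims come only from the Disk queue, oldest first. A hedged sketch of the three call sites (the wrapper functions are illustrative, not crate API):

```rust
// Illustrative glue around the split API introduced above. `LiquidPolicy`,
// `EntryID`, and the three policy methods are from this diff.
fn shrink_memory(policy: &LiquidPolicy, count: usize) -> Vec<EntryID> {
    // Drains Arrow entries first, then Liquid; Disk-queue entries are skipped.
    policy.find_memory_victim(count)
}

fn shrink_disk(policy: &LiquidPolicy, count: usize) -> Vec<EntryID> {
    // Victims come only from the Disk queue, oldest first.
    policy.find_disk_victim(count)
}

fn forget(policy: &LiquidPolicy, id: &EntryID) {
    // Explicit removal (not eviction): unlink and free the policy's node
    // so the queues never hand out a stale entry.
    policy.notify_remove(id);
}
```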
diff --git a/src/core/src/cache/policies/hydration.rs b/src/core/src/cache/policies/hydration.rs
index e2448ebf..ea96175d 100644
--- a/src/core/src/cache/policies/hydration.rs
+++ b/src/core/src/cache/policies/hydration.rs
@@ -87,6 +87,7 @@ fn hydrate_variant_paths(
         combined_values,
         nulls,
         squeezed.original_arrow_data_type(),
+        squeezed.disk_backing().disk_bytes(),
     );
     Some(CacheEntry::memory_squeezed_liquid(
         Arc::new(merged) as LiquidSqueezedArrayRef
@@ -96,16 +97,27 @@
 impl HydrationPolicy for AlwaysHydrate {
     fn hydrate(&self, request: &HydrationRequest<'_>) -> Option<CacheEntry> {
         match (request.cached, &request.materialized) {
-            (CacheEntry::DiskArrow(_), MaterializedEntry::Arrow(arr)) => {
+            (CacheEntry::DiskArrow { disk_bytes, .. }, MaterializedEntry::Arrow(arr)) => {
                 if let Some(CacheExpression::VariantGet { requests }) = request.expression
                     && let Some((squeezed, _bytes)) =
                         try_variant_squeeze(arr, requests, request.compressor.as_ref())
                 {
-                    return Some(CacheEntry::memory_squeezed_liquid(squeezed));
+                    let variant = squeezed
+                        .as_any()
+                        .downcast_ref::<VariantStructSqueezedArray>()?;
+                    let squeezed = VariantStructSqueezedArray::new(
+                        variant.typed_values(),
+                        variant.nulls(),
+                        variant.original_arrow_data_type(),
+                        *disk_bytes,
+                    );
+                    return Some(CacheEntry::memory_squeezed_liquid(
+                        Arc::new(squeezed) as LiquidSqueezedArrayRef
+                    ));
                 }
                 Some(CacheEntry::memory_arrow((*arr).clone()))
             }
-            (CacheEntry::DiskLiquid(_), MaterializedEntry::Liquid(liq)) => {
+            (CacheEntry::DiskLiquid { .. }, MaterializedEntry::Liquid(liq)) => {
                 Some(CacheEntry::memory_liquid((*liq).clone()))
             }
             (CacheEntry::MemoryLiquid(_), _) => None,
@@ -169,7 +181,7 @@ mod tests {
         let expr = CacheExpression::variant_get("age", DataType::Int64);
         let policy = AlwaysHydrate::new();
         let compressor = Arc::new(LiquidCompressorStates::new());
-        let cached_entry = CacheEntry::disk_arrow(arr.data_type().clone());
+        let cached_entry = CacheEntry::disk_arrow(arr.data_type().clone(), 1);
 
         let hydrated = policy.hydrate(&HydrationRequest {
             entry_id: EntryID::from(0),
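Note: the hydration change threads the `disk_bytes` recorded on the `DiskArrow` entry into the rebuilt `VariantStructSqueezedArray`, so a hydrated entry keeps reporting the same on-disk footprint it had before hydration. A one-function sketch of that invariant (the function name is illustrative; the trait methods are from this diff):

```rust
// The squeezed array produced by hydration must report the byte count
// recorded when the entry was first spilled; a later eviction then
// releases exactly the disk budget that was reserved for it.
fn footprint_preserved(hydrated: &dyn LiquidSqueezedArray, disk_bytes_before: usize) -> bool {
    hydrated.disk_backing().disk_bytes() == disk_bytes_before
}
```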
diff --git a/src/core/src/cache/policies/squeeze.rs b/src/core/src/cache/policies/squeeze.rs
index 21e09eff..53f48913 100644
--- a/src/core/src/cache/policies/squeeze.rs
+++ b/src/core/src/cache/policies/squeeze.rs
@@ -18,16 +18,29 @@ use crate::liquid_array::{
 use crate::utils::VariantSchema;
 
 /// What to do when we need to squeeze an entry?
+#[derive(Debug, Clone)]
+pub enum SqueezeOutcome {
+    /// Replace the cache entry, optionally writing bytes to disk first.
+    Replace {
+        /// Replacement cache entry.
+        entry: CacheEntry,
+        /// Bytes that must be written before inserting the replacement.
+        bytes_to_write: Option<Bytes>,
+    },
+    /// Remove the entry entirely.
+    Remove,
+}
+
+/// Policy that chooses the next representation for an entry under memory pressure.
 pub trait SqueezePolicy: std::fmt::Debug + Send + Sync {
     /// Squeeze the entry.
-    /// Returns the squeezed entry and the bytes that were used to store the entry on disk.
     fn squeeze(
         &self,
         entry: &CacheEntry,
         compressor: &LiquidCompressorStates,
         squeeze_hint: Option<&CacheExpression>,
         squeeze_io: &Arc<dyn SqueezeIoHandler>,
-    ) -> (CacheEntry, Option<Bytes>);
+    ) -> SqueezeOutcome;
 }
 
 /// Squeeze the entry to disk.
@@ -41,31 +54,37 @@ impl SqueezePolicy for Evict {
         _compressor: &LiquidCompressorStates,
         _squeeze_hint: Option<&CacheExpression>,
         _squeeze_io: &Arc<dyn SqueezeIoHandler>,
-    ) -> (CacheEntry, Option<Bytes>) {
+    ) -> SqueezeOutcome {
         match entry {
             CacheEntry::MemoryArrow(array) => {
                 let bytes = arrow_to_bytes(array).expect("failed to convert arrow to bytes");
-                (
-                    CacheEntry::disk_arrow(array.data_type().clone()),
-                    Some(bytes),
-                )
+                SqueezeOutcome::Replace {
+                    entry: CacheEntry::disk_arrow(array.data_type().clone(), bytes.len()),
+                    bytes_to_write: Some(bytes),
+                }
             }
             CacheEntry::MemoryLiquid(liquid_array) => {
                 let disk_data = liquid_array.to_bytes();
-                (
-                    CacheEntry::disk_liquid(liquid_array.original_arrow_data_type()),
-                    Some(Bytes::from(disk_data)),
-                )
+                SqueezeOutcome::Replace {
+                    entry: CacheEntry::disk_liquid(
+                        liquid_array.original_arrow_data_type(),
+                        disk_data.len(),
+                    ),
+                    bytes_to_write: Some(Bytes::from(disk_data)),
+                }
             }
             CacheEntry::MemorySqueezedLiquid(squeezed_array) => {
                 let data_type = squeezed_array.original_arrow_data_type();
                 let new_entry = match squeezed_array.disk_backing() {
-                    SqueezedBacking::Liquid => CacheEntry::disk_liquid(data_type),
-                    SqueezedBacking::Arrow => CacheEntry::disk_arrow(data_type),
+                    SqueezedBacking::Liquid(n) => CacheEntry::disk_liquid(data_type, n),
+                    SqueezedBacking::Arrow(n) => CacheEntry::disk_arrow(data_type, n),
                 };
-                (new_entry, None)
+                SqueezeOutcome::Replace {
+                    entry: new_entry,
+                    bytes_to_write: None,
+                }
             }
-            CacheEntry::DiskLiquid(_) | CacheEntry::DiskArrow(_) => (entry.clone(), None),
+            CacheEntry::DiskLiquid { .. } | CacheEntry::DiskArrow { .. } => SqueezeOutcome::Remove,
         }
     }
 }
@@ -81,7 +100,7 @@ impl SqueezePolicy for TranscodeSqueezeEvict {
         compressor: &LiquidCompressorStates,
         squeeze_hint: Option<&CacheExpression>,
         squeeze_io: &Arc<dyn SqueezeIoHandler>,
-    ) -> (CacheEntry, Option<Bytes>) {
+    ) -> SqueezeOutcome {
         match entry {
             CacheEntry::MemoryArrow(array) => {
                 if let Some(requests) =
@@ -89,20 +108,23 @@
                     && let Some((squeezed_array, bytes)) =
                         try_variant_squeeze(array, requests, compressor)
                 {
-                    return (
-                        CacheEntry::memory_squeezed_liquid(squeezed_array),
-                        Some(bytes),
-                    );
+                    return SqueezeOutcome::Replace {
+                        entry: CacheEntry::memory_squeezed_liquid(squeezed_array),
+                        bytes_to_write: Some(bytes),
+                    };
                 }
                 match transcode_liquid_inner_with_hint(array, compressor, squeeze_hint) {
-                    Ok(liquid_array) => (CacheEntry::memory_liquid(liquid_array), None),
+                    Ok(liquid_array) => SqueezeOutcome::Replace {
+                        entry: CacheEntry::memory_liquid(liquid_array),
+                        bytes_to_write: None,
+                    },
                     Err(_) => {
                         let bytes =
                             arrow_to_bytes(array).expect("failed to convert arrow to bytes");
-                        (
-                            CacheEntry::disk_arrow(array.data_type().clone()),
-                            Some(bytes),
-                        )
+                        SqueezeOutcome::Replace {
+                            entry: CacheEntry::disk_arrow(array.data_type().clone(), bytes.len()),
+                            bytes_to_write: Some(bytes),
+                        }
                     }
                 }
             }
@@ -112,26 +134,32 @@
                     Some(result) => result,
                     None => {
                         let bytes = Bytes::from(liquid_array.to_bytes());
-                        return (
-                            CacheEntry::disk_liquid(liquid_array.original_arrow_data_type()),
-                            Some(bytes),
-                        );
+                        return SqueezeOutcome::Replace {
+                            entry: CacheEntry::disk_liquid(
+                                liquid_array.original_arrow_data_type(),
+                                bytes.len(),
+                            ),
+                            bytes_to_write: Some(bytes),
+                        };
                     }
                 };
-                (
-                    CacheEntry::memory_squeezed_liquid(squeezed_array),
-                    Some(bytes),
-                )
+                SqueezeOutcome::Replace {
+                    entry: CacheEntry::memory_squeezed_liquid(squeezed_array),
+                    bytes_to_write: Some(bytes),
+                }
             }
             CacheEntry::MemorySqueezedLiquid(squeezed_array) => {
                 let data_type = squeezed_array.original_arrow_data_type();
                 let new_entry = match squeezed_array.disk_backing() {
-                    SqueezedBacking::Liquid => CacheEntry::disk_liquid(data_type),
-                    SqueezedBacking::Arrow => CacheEntry::disk_arrow(data_type),
+                    SqueezedBacking::Liquid(n) => CacheEntry::disk_liquid(data_type, n),
+                    SqueezedBacking::Arrow(n) => CacheEntry::disk_arrow(data_type, n),
                };
-                (new_entry, None)
+                SqueezeOutcome::Replace {
+                    entry: new_entry,
+                    bytes_to_write: None,
+                }
             }
-            CacheEntry::DiskLiquid(_) | CacheEntry::DiskArrow(_) => (entry.clone(), None),
+            CacheEntry::DiskLiquid { .. } | CacheEntry::DiskArrow { .. } => SqueezeOutcome::Remove,
         }
     }
 }
@@ -147,37 +175,46 @@ impl SqueezePolicy for TranscodeEvict {
         compressor: &LiquidCompressorStates,
         _squeeze_hint: Option<&CacheExpression>,
         _squeeze_io: &Arc<dyn SqueezeIoHandler>,
-    ) -> (CacheEntry, Option<Bytes>) {
+    ) -> SqueezeOutcome {
         match entry {
             CacheEntry::MemoryArrow(array) => {
                 match transcode_liquid_inner_with_hint(array, compressor, None) {
-                    Ok(liquid_array) => (CacheEntry::memory_liquid(liquid_array), None),
+                    Ok(liquid_array) => SqueezeOutcome::Replace {
+                        entry: CacheEntry::memory_liquid(liquid_array),
+                        bytes_to_write: None,
+                    },
                     Err(_) => {
                         let bytes =
                             arrow_to_bytes(array).expect("failed to convert arrow to bytes");
-                        (
-                            CacheEntry::disk_arrow(array.data_type().clone()),
-                            Some(bytes),
-                        )
+                        SqueezeOutcome::Replace {
+                            entry: CacheEntry::disk_arrow(array.data_type().clone(), bytes.len()),
+                            bytes_to_write: Some(bytes),
+                        }
                     }
                 }
             }
             CacheEntry::MemoryLiquid(liquid_array) => {
                 let bytes = Bytes::from(liquid_array.to_bytes());
-                (
-                    CacheEntry::disk_liquid(liquid_array.original_arrow_data_type()),
-                    Some(bytes),
-                )
+                SqueezeOutcome::Replace {
+                    entry: CacheEntry::disk_liquid(
+                        liquid_array.original_arrow_data_type(),
+                        bytes.len(),
+                    ),
+                    bytes_to_write: Some(bytes),
+                }
             }
             CacheEntry::MemorySqueezedLiquid(squeezed_array) => {
                 let data_type = squeezed_array.original_arrow_data_type();
                 let new_entry = match squeezed_array.disk_backing() {
-                    SqueezedBacking::Liquid => CacheEntry::disk_liquid(data_type),
-                    SqueezedBacking::Arrow => CacheEntry::disk_arrow(data_type),
+                    SqueezedBacking::Liquid(n) => CacheEntry::disk_liquid(data_type, n),
+                    SqueezedBacking::Arrow(n) => CacheEntry::disk_arrow(data_type, n),
                };
-                (new_entry, None)
+                SqueezeOutcome::Replace {
+                    entry: new_entry,
+                    bytes_to_write: None,
+                }
             }
-            CacheEntry::DiskLiquid(_) | CacheEntry::DiskArrow(_) => (entry.clone(), None),
+            CacheEntry::DiskLiquid { .. } | CacheEntry::DiskArrow { .. } => SqueezeOutcome::Remove,
         }
     }
 }
@@ -243,8 +280,12 @@ pub(crate) fn try_variant_squeeze(
         };
         liquid_values.push((path, liquid_array));
     }
-    let squeezed =
-        VariantStructSqueezedArray::new(liquid_values, nulls, backing_array.data_type().clone());
+    let squeezed = VariantStructSqueezedArray::new(
+        liquid_values,
+        nulls,
+        backing_array.data_type().clone(),
+        bytes.len(),
+    );
     Some((Arc::new(squeezed) as LiquidSqueezedArrayRef, bytes))
 }
@@ -324,6 +365,16 @@ mod tests {
         batch.column(0).clone()
     }
 
+    fn into_replace(outcome: SqueezeOutcome) -> (CacheEntry, Option<Bytes>) {
+        match outcome {
+            SqueezeOutcome::Replace {
+                entry,
+                bytes_to_write,
+            } => (entry, bytes_to_write),
+            SqueezeOutcome::Remove => panic!("expected replacement"),
+        }
+    }
+
     fn struct_array() -> ArrayRef {
         let values = Arc::new(Int32Array::from(vec![Some(1), None, Some(3)])) as ArrayRef;
         let field = Arc::new(Field::new("value", DataType::Int32, true));
@@ -337,16 +388,23 @@
         let squeeze_io: Arc<dyn SqueezeIoHandler> = Arc::new(TestSqueezeIo::default());
         // MemoryArrow -> DiskArrow + bytes (Arrow IPC)
         let arr = int_array(8);
-        let (new_batch, bytes) = disk.squeeze(
+        let (new_batch, bytes) = into_replace(disk.squeeze(
             &CacheEntry::memory_arrow(arr.clone()),
             &states,
             None,
             &squeeze_io,
-        );
+        ));
         let data = new_batch;
         match (data, bytes) {
-            (CacheEntry::DiskArrow(dt), Some(b)) => {
+            (
+                CacheEntry::DiskArrow {
+                    data_type: dt,
+                    disk_bytes,
+                },
+                Some(b),
+            ) => {
                 assert_eq!(dt, DataType::Int32);
+                assert_eq!(disk_bytes, b.len());
                 let decoded = decode_arrow(&b);
                 assert_eq!(decoded.as_ref(), arr.as_ref());
             }
@@ -356,15 +414,16 @@
         // MemoryLiquid (strings) -> MemoryHybridLiquid + bytes
         let strings = Arc::new(StringArray::from(vec!["a", "b", "a"])) as ArrayRef;
         let liquid = transcode_liquid_inner(&strings, &states).unwrap();
-        let (new_batch, bytes) = disk.squeeze(
+        let (new_batch, bytes) = into_replace(disk.squeeze(
             &CacheEntry::memory_liquid(liquid.clone()),
             &states,
             None,
             &squeeze_io,
-        );
+        ));
         let data = new_batch;
         match (data, bytes) {
-            (CacheEntry::DiskLiquid(_), Some(b)) => {
+            (CacheEntry::DiskLiquid { disk_bytes, .. }, Some(b)) => {
+                assert_eq!(disk_bytes, b.len());
                 assert!(!b.is_empty());
             }
             other => panic!("unexpected: {other:?}"),
@@ -376,33 +435,39 @@
             Some((h, _b)) => h,
             None => panic!("squeeze should succeed for byte-view"),
         };
-        let (new_batch, bytes) = disk.squeeze(
+        let (new_batch, bytes) = into_replace(disk.squeeze(
             &CacheEntry::memory_squeezed_liquid(squeezed),
             &states,
             expression,
             &squeeze_io,
-        );
+        ));
         let data = new_batch;
         match (data, bytes) {
-            (CacheEntry::DiskLiquid(_data_type), None) => {}
+            (
+                CacheEntry::DiskLiquid {
+                    data_type: _data_type,
+                    ..
+                },
+                None,
+            ) => {}
             other => panic!("unexpected: {other:?}"),
         }
-        // Disk* -> unchanged, no bytes
-        let (b1, w1) = disk.squeeze(
-            &CacheEntry::disk_arrow(DataType::Utf8),
+        // Disk* -> remove
+        let b1 = disk.squeeze(
+            &CacheEntry::disk_arrow(DataType::Utf8, 1),
             &states,
             expression,
             &squeeze_io,
         );
-        assert!(matches!(b1, CacheEntry::DiskArrow(DataType::Utf8)) && w1.is_none());
-        let (b2, w2) = disk.squeeze(
-            &CacheEntry::disk_liquid(DataType::Utf8),
+        assert!(matches!(b1, SqueezeOutcome::Remove));
+        let b2 = disk.squeeze(
+            &CacheEntry::disk_liquid(DataType::Utf8, 1),
             &states,
             expression,
             &squeeze_io,
         );
-        assert!(matches!(b2, CacheEntry::DiskLiquid(DataType::Utf8)) && w2.is_none());
+        assert!(matches!(b2, SqueezeOutcome::Remove));
     }
 
     #[test]
@@ -413,12 +478,12 @@
         // MemoryArrow -> MemoryLiquid, no bytes
         let arr = int_array(8);
-        let (new_batch, bytes) = to_liquid.squeeze(
+        let (new_batch, bytes) = into_replace(to_liquid.squeeze(
             &CacheEntry::memory_arrow(arr.clone()),
             &states,
             None,
             &squeeze_io,
-        );
+        ));
         assert!(bytes.is_none());
         match new_batch {
             CacheEntry::MemoryLiquid(liq) => {
@@ -431,12 +496,12 @@
         // MemoryLiquid (strings) -> MemorySqueezedLiquid + bytes
         let strings = Arc::new(StringArray::from(vec!["x", "y", "x"])) as ArrayRef;
         let liquid = transcode_liquid_inner(&strings, &states).unwrap();
-        let (new_batch, bytes) = to_liquid.squeeze(
+        let (new_batch, bytes) = into_replace(to_liquid.squeeze(
             &CacheEntry::memory_liquid(liquid),
             &states,
             expression,
             &squeeze_io,
-        );
+        ));
         match (new_batch, bytes) {
             (CacheEntry::MemorySqueezedLiquid(_), Some(b)) => assert!(!b.is_empty()),
             other => panic!("unexpected: {other:?}"),
@@ -446,32 +511,38 @@
         let strings = Arc::new(StringArray::from(vec!["m", "n"])) as ArrayRef;
         let liquid = transcode_liquid_inner(&strings, &states).unwrap();
         let squeezed = liquid.squeeze(squeeze_io.clone(), expression).unwrap().0;
-        let (new_batch, bytes) = to_liquid.squeeze(
+        let (new_batch, bytes) = into_replace(to_liquid.squeeze(
             &CacheEntry::memory_squeezed_liquid(squeezed),
             &states,
             expression,
             &squeeze_io,
-        );
+        ));
         match (new_batch, bytes) {
-            (CacheEntry::DiskLiquid(DataType::Utf8), None) => {}
+            (
+                CacheEntry::DiskLiquid {
+                    data_type: DataType::Utf8,
+                    ..
+                },
+                None,
+            ) => {}
             other => panic!("unexpected: {other:?}"),
         }
-        // Disk* -> unchanged
-        let (b1, w1) = to_liquid.squeeze(
-            &CacheEntry::disk_arrow(DataType::Utf8),
+        // Disk* -> remove
+        let b1 = to_liquid.squeeze(
+            &CacheEntry::disk_arrow(DataType::Utf8, 1),
             &states,
             expression,
             &squeeze_io,
         );
-        assert!(matches!(b1, CacheEntry::DiskArrow(DataType::Utf8)) && w1.is_none());
-        let (b2, w2) = to_liquid.squeeze(
-            &CacheEntry::disk_liquid(DataType::Utf8),
+        assert!(matches!(b1, SqueezeOutcome::Remove));
+        let b2 = to_liquid.squeeze(
+            &CacheEntry::disk_liquid(DataType::Utf8, 1),
             &states,
             expression,
             &squeeze_io,
         );
-        assert!(matches!(b2, CacheEntry::DiskLiquid(DataType::Utf8)) && w2.is_none());
+        assert!(matches!(b2, SqueezeOutcome::Remove));
     }
 
     #[test]
@@ -480,15 +551,22 @@
         let states = LiquidCompressorStates::new();
         let squeeze_io: Arc<dyn SqueezeIoHandler> = Arc::new(TestSqueezeIo::default());
         let struct_arr = struct_array();
-        let (new_batch, bytes) = to_liquid.squeeze(
+        let (new_batch, bytes) = into_replace(to_liquid.squeeze(
             &CacheEntry::memory_arrow(struct_arr.clone()),
             &states,
             None,
             &squeeze_io,
-        );
+        ));
         match (new_batch, bytes) {
-            (CacheEntry::DiskArrow(dt), Some(b)) => {
+            (
+                CacheEntry::DiskArrow {
+                    data_type: dt,
+                    disk_bytes,
+                },
+                Some(b),
+            ) => {
                 assert_eq!(&dt, struct_arr.data_type());
+                assert_eq!(disk_bytes, b.len());
                 assert_eq!(decode_arrow(&b).as_ref(), struct_arr.as_ref());
             }
             other => panic!("expected disk arrow fallback, got {other:?}"),
@@ -501,15 +579,22 @@
         let states = LiquidCompressorStates::new();
         let squeeze_io: Arc<dyn SqueezeIoHandler> = Arc::new(TestSqueezeIo::default());
         let struct_arr = struct_array();
-        let (new_batch, bytes) = to_disk.squeeze(
+        let (new_batch, bytes) = into_replace(to_disk.squeeze(
             &CacheEntry::memory_arrow(struct_arr.clone()),
             &states,
             None,
             &squeeze_io,
-        );
+        ));
         match (new_batch, bytes) {
-            (CacheEntry::DiskArrow(dt), Some(b)) => {
+            (
+                CacheEntry::DiskArrow {
+                    data_type: dt,
+                    disk_bytes,
+                },
+                Some(b),
+            ) => {
                 assert_eq!(&dt, struct_arr.data_type());
+                assert_eq!(disk_bytes, b.len());
                 assert_eq!(decode_arrow(&b).as_ref(), struct_arr.as_ref());
             }
             other => panic!("expected disk arrow fallback, got {other:?}"),
@@ -609,7 +694,7 @@
         use futures::executor::block_on;
 
         assert!(!bytes.is_empty());
-        assert_eq!(squeezed.disk_backing(), SqueezedBacking::Arrow);
+        assert!(matches!(squeezed.disk_backing(), SqueezedBacking::Arrow(_)));
         let struct_squeezed = squeezed
             .as_any()
             .downcast_ref::<VariantStructSqueezedArray>()
@@ -643,12 +728,12 @@
         let hint = CacheExpression::variant_get("name", DataType::Utf8);
         let squeeze_io: Arc<dyn SqueezeIoHandler> = Arc::new(TestSqueezeIo::default());
 
-        let (new_batch, bytes) = policy.squeeze(
+        let (new_batch, bytes) = into_replace(policy.squeeze(
             &CacheEntry::memory_arrow(variant_arr),
             &states,
             Some(&hint),
             &squeeze_io,
-        );
+        ));
 
         match (new_batch, bytes) {
             (CacheEntry::MemorySqueezedLiquid(squeezed), Some(b)) => {
@@ -666,12 +751,12 @@
         let hint = CacheExpression::variant_get("age", DataType::Int64);
         let squeeze_io: Arc<dyn SqueezeIoHandler> = Arc::new(TestSqueezeIo::default());
 
-        let (new_batch, bytes) = policy.squeeze(
+        let (new_batch, bytes) = into_replace(policy.squeeze(
             &CacheEntry::memory_arrow(variant_arr),
             &states,
             Some(&hint),
             &squeeze_io,
-        );
+        ));
 
         match (new_batch, bytes) {
             (CacheEntry::MemorySqueezedLiquid(squeezed), Some(b)) => {
@@ -692,12 +777,12 @@
         let hint = CacheExpression::variant_get("name", DataType::Utf8);
         let squeeze_io: Arc<dyn SqueezeIoHandler> = Arc::new(TestSqueezeIo::default());
 
-        let (new_batch, bytes) = policy.squeeze(
+        let (new_batch, bytes) = into_replace(policy.squeeze(
             &CacheEntry::memory_arrow(variant_arr),
             &states,
             Some(&hint),
             &squeeze_io,
-        );
+        ));
 
         match (new_batch, bytes) {
             (CacheEntry::MemorySqueezedLiquid(squeezed), Some(b)) => {
@@ -728,15 +813,18 @@
         let variant_arr = enriched_variant_array("name", DataType::Utf8);
         let squeeze_io: Arc<dyn SqueezeIoHandler> = Arc::new(TestSqueezeIo::default());
 
-        let (new_batch, bytes) = policy.squeeze(
+        let (new_batch, bytes) = into_replace(policy.squeeze(
             &CacheEntry::memory_arrow(variant_arr),
             &states,
             None,
             &squeeze_io,
-        );
+        ));
 
         match (new_batch, bytes) {
-            (CacheEntry::DiskArrow(_), Some(b)) => assert!(!b.is_empty()),
+            (CacheEntry::DiskArrow { disk_bytes, .. }, Some(b)) => {
+                assert_eq!(disk_bytes, b.len());
+                assert!(!b.is_empty());
+            }
             (CacheEntry::MemoryLiquid(_), None) => {}
             other => panic!("expected DiskArrow with bytes or MemoryLiquid, got {other:?}"),
         }
@@ -750,16 +838,23 @@
         let variant_arr = enriched_variant_array("name", DataType::Utf8);
         let hint = CacheExpression::variant_get("age", DataType::Int64);
 
-        let (new_batch, bytes) = policy.squeeze(
+        let (new_batch, bytes) = into_replace(policy.squeeze(
             &CacheEntry::memory_arrow(variant_arr.clone()),
             &states,
             Some(&hint),
             &squeeze_io,
-        );
+        ));
 
         match (new_batch, bytes) {
-            (CacheEntry::DiskArrow(dt), Some(b)) => {
+            (
+                CacheEntry::DiskArrow {
+                    data_type: dt,
+                    disk_bytes,
+                },
+                Some(b),
+            ) => {
                 assert_eq!(dt, variant_arr.data_type().clone());
+                assert_eq!(disk_bytes, b.len());
                 assert!(!b.is_empty());
             }
             other => panic!("expected DiskArrow fallback when path missing, got {other:?}"),
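Note: `SqueezeOutcome` replaces the old `(CacheEntry, Option<Bytes>)` tuple and makes the third case explicit: disk-resident entries are no longer returned unchanged, the policy asks for them to be removed. A sketch of the caller-side contract (the `squeeze` call and the enum are from this diff; `persist`, `install`, and `drop_entry` are illustrative stand-ins for the cache store's internals):

```rust
use std::sync::Arc;
use bytes::Bytes;

fn persist(_bytes: &Bytes) { /* write via the squeeze I/O handler */ }
fn install(_replacement: CacheEntry) { /* publish into the cache index */ }
fn drop_entry() { /* remove from the index and release budgets */ }

fn apply_squeeze(
    policy: &dyn SqueezePolicy,
    entry: &CacheEntry,
    compressor: &LiquidCompressorStates,
    hint: Option<&CacheExpression>,
    io: &Arc<dyn SqueezeIoHandler>,
) {
    match policy.squeeze(entry, compressor, hint, io) {
        SqueezeOutcome::Replace { entry: replacement, bytes_to_write } => {
            // Persist first, then publish: the replacement entry's disk
            // byte count must refer to bytes that actually exist on disk.
            if let Some(bytes) = bytes_to_write {
                persist(&bytes);
            }
            install(replacement);
        }
        SqueezeOutcome::Remove => drop_entry(),
    }
}
```

The ordering shown here also explains the snapshot reordering later in this diff, where `insert_success` now follows the matching `io_write` for each victim.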
diff --git a/src/core/src/cache/tests/policies.rs b/src/core/src/cache/tests/policies.rs
index b0aba179..29c84990 100644
--- a/src/core/src/cache/tests/policies.rs
+++ b/src/core/src/cache/tests/policies.rs
@@ -12,13 +12,13 @@ async fn default_policies() {
         .with_cache_policy(Box::new(LiquidPolicy::new()))
         .with_hydration_policy(Box::new(AlwaysHydrate::new()))
         .with_squeeze_policy(Box::new(TranscodeSqueezeEvict))
-        .with_max_cache_bytes(capacity)
+        .with_max_memory_bytes(capacity)
         .build()
         .await;
 
     for i in 0..5 {
         let entry_id = EntryID::from(i);
-        cache.insert(entry_id, test_array.clone()).await;
+        cache.insert(entry_id, test_array.clone()).await.unwrap();
     }
 
     for i in 0..5 {
@@ -40,14 +40,20 @@ async fn insert_wont_fit_cache() {
         .with_cache_policy(Box::new(LiquidPolicy::new()))
         .with_hydration_policy(Box::new(AlwaysHydrate::new()))
         .with_squeeze_policy(Box::new(TranscodeSqueezeEvict))
-        .with_max_cache_bytes(capacity)
+        .with_max_memory_bytes(capacity)
         .build()
         .await;
 
-    cache.insert(EntryID::from(0), test_array.clone()).await;
+    cache
+        .insert(EntryID::from(0), test_array.clone())
+        .await
+        .unwrap();
     let array_3x = arrow::compute::concat(&[&test_array, &test_array, &test_array]).unwrap();
     let array_9x = arrow::compute::concat(&[&array_3x, &array_3x, &array_3x]).unwrap();
     let array_27x = arrow::compute::concat(&[&array_9x, &array_9x, &array_9x]).unwrap();
-    cache.insert(EntryID::from(1), array_27x.clone()).await;
+    cache
+        .insert(EntryID::from(1), array_27x.clone())
+        .await
+        .unwrap();
     cache.get(&EntryID::from(1)).read().await.unwrap();
 
     let trace = cache.consume_event_trace();
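Note: every insert call site in these tests now ends in `.unwrap()`, which indicates the insert future resolves to a `Result` (plausibly failing when neither the memory nor the disk budget can take the batch, matching the new `disk_reservation_failures` counter later in this diff). The concrete error type is not visible here, so the sketch below is an assumption, not documented crate API:

```rust
// Assumes the insert error implements std::error::Error + Send + Sync,
// so `?` can box it; adjust to the crate's real error type.
async fn insert_or_report(
    cache: &LiquidCache,
    id: EntryID,
    array: ArrayRef,
) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
    cache.insert(id, array).await?;
    Ok(())
}
```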
b/src/core/src/cache/tests/snapshots/liquid_cache__cache__tests__policies__default_policies.snap
@@ -18,9 +18,9 @@ event=squeeze_victim entry=2
 event=insert_success entry=2 kind=MemoryLiquid
 event=squeeze_victim entry=0
 event=io_write entry=0 kind=DiskLiquid bytes=1320
+event=insert_success entry=0 kind=DiskLiquid
 event=squeeze_victim entry=1
 event=io_write entry=1 kind=DiskLiquid bytes=1320
-event=insert_success entry=0 kind=DiskLiquid
 event=insert_success entry=1 kind=DiskLiquid
 event=insert_success entry=3 kind=MemoryArrow
 event=insert_failed entry=4 kind=MemoryArrow
diff --git a/src/core/src/cache/tests/snapshots/liquid_cache__cache__tests__squeezed__read_squeezed_date_time.snap b/src/core/src/cache/tests/snapshots/liquid_cache__cache__tests__squeezed__read_squeezed_date_time.snap
index de26da71..1274941f 100644
--- a/src/core/src/cache/tests/snapshots/liquid_cache__cache__tests__squeezed__read_squeezed_date_time.snap
+++ b/src/core/src/cache/tests/snapshots/liquid_cache__cache__tests__squeezed__read_squeezed_date_time.snap
@@ -18,9 +18,9 @@ event=squeeze_victim entry=2
 event=insert_success entry=2 kind=MemoryLiquid
 event=squeeze_victim entry=0
 event=io_write entry=0 kind=MemorySqueezedLiquid bytes=6184
+event=insert_success entry=0 kind=MemorySqueezedLiquid
 event=squeeze_victim entry=1
 event=io_write entry=1 kind=MemorySqueezedLiquid bytes=6184
-event=insert_success entry=0 kind=MemorySqueezedLiquid
 event=insert_success entry=1 kind=MemorySqueezedLiquid
 event=insert_success entry=3 kind=MemoryArrow
 event=read entry=0 selection=false expr=ExtractDate32:Year cached=MemorySqueezedLiquid
diff --git a/src/core/src/cache/tests/snapshots/liquid_cache__cache__tests__squeezed__read_squeezed_int64_array.snap b/src/core/src/cache/tests/snapshots/liquid_cache__cache__tests__squeezed__read_squeezed_int64_array.snap
index b0f67dbf..d0ae4ddb 100644
--- a/src/core/src/cache/tests/snapshots/liquid_cache__cache__tests__squeezed__read_squeezed_int64_array.snap
+++ b/src/core/src/cache/tests/snapshots/liquid_cache__cache__tests__squeezed__read_squeezed_int64_array.snap
@@ -18,9 +18,9 @@ event=squeeze_victim entry=2
 event=insert_success entry=2 kind=MemoryLiquid
 event=squeeze_victim entry=0
 event=io_write entry=0 kind=MemorySqueezedLiquid bytes=6184
+event=insert_success entry=0 kind=MemorySqueezedLiquid
 event=squeeze_victim entry=1
 event=io_write entry=1 kind=DiskLiquid bytes=6184
-event=insert_success entry=0 kind=MemorySqueezedLiquid
 event=insert_success entry=1 kind=DiskLiquid
 event=insert_success entry=3 kind=MemoryArrow
 event=read entry=0 selection=false expr=PredicateColumn cached=MemorySqueezedLiquid
diff --git a/src/core/src/cache/tests/snapshots/liquid_cache__cache__tests__squeezed__read_squeezed_variant_path.snap b/src/core/src/cache/tests/snapshots/liquid_cache__cache__tests__squeezed__read_squeezed_variant_path.snap
index 3ce06825..18756a3b 100644
--- a/src/core/src/cache/tests/snapshots/liquid_cache__cache__tests__squeezed__read_squeezed_variant_path.snap
+++ b/src/core/src/cache/tests/snapshots/liquid_cache__cache__tests__squeezed__read_squeezed_variant_path.snap
@@ -14,9 +14,9 @@ event=insert_failed entry=2 kind=MemoryArrow
 event=squeeze_begin victims=[1,0]
 event=squeeze_victim entry=1
 event=io_write entry=1 kind=MemorySqueezedLiquid bytes=7816
+event=insert_success entry=1 kind=MemorySqueezedLiquid
 event=squeeze_victim entry=0
 event=insert_success entry=0 kind=DiskArrow
-event=insert_success entry=1 kind=MemorySqueezedLiquid
 event=insert_success entry=2 kind=MemoryArrow
 event=read entry=0 selection=false expr=VariantGet[name:Utf8] cached=DiskArrow
 event=io_read_arrow entry=0 bytes=7816
diff --git a/src/core/src/cache/tests/squeezed.rs b/src/core/src/cache/tests/squeezed.rs
index b14c6800..ec46a450 100644
--- a/src/core/src/cache/tests/squeezed.rs
+++ b/src/core/src/cache/tests/squeezed.rs
@@ -6,8 +6,8 @@ use parquet_variant_compute::json_to_variant;
 
 use crate::{
     cache::{
-        AlwaysHydrate, CacheExpression, DefaultIoContext, EntryID, LiquidCacheBuilder,
-        LiquidPolicy, TranscodeSqueezeEvict,
+        AlwaysHydrate, CacheExpression, EntryID, LiquidCacheBuilder, LiquidPolicy,
+        TranscodeSqueezeEvict,
     },
     liquid_array::Date32Field,
 };
@@ -27,12 +27,12 @@ async fn read_squeezed_date_time() {
         .with_cache_policy(Box::new(LiquidPolicy::new()))
         .with_hydration_policy(Box::new(AlwaysHydrate::new()))
         .with_squeeze_policy(Box::new(TranscodeSqueezeEvict))
-        .with_max_cache_bytes(array_size * 2)
-        .with_io_context(Arc::new(DefaultIoContext::new(
+        .with_max_memory_bytes(array_size * 2)
+        .with_store(
             t4::mount(temp_dir.path().join("liquid_cache.t4"))
                 .await
                 .unwrap(),
-        )))
+        )
         .build()
         .await;
 
@@ -43,7 +43,8 @@
         cache
             .insert(entry_id, array.clone())
             .with_squeeze_hint(expression.clone())
-            .await;
+            .await
+            .unwrap();
     }
 
     for i in 0..4 {
@@ -92,12 +93,12 @@ async fn read_squeezed_variant_path() {
         .with_cache_policy(Box::new(LiquidPolicy::new()))
         .with_hydration_policy(Box::new(AlwaysHydrate::new()))
         .with_squeeze_policy(Box::new(TranscodeSqueezeEvict))
-        .with_max_cache_bytes(array_size * 3 / 2)
-        .with_io_context(Arc::new(DefaultIoContext::new(
+        .with_max_memory_bytes(array_size * 3 / 2)
+        .with_store(
             t4::mount(temp_dir.path().join("liquid_cache.t4"))
                 .await
                 .unwrap(),
-        )))
+        )
         .build()
         .await;
 
@@ -112,7 +113,8 @@
         cache
             .insert(entry_id, variant_array.clone())
             .with_squeeze_hint(name_expr.clone())
-            .await;
+            .await
+            .unwrap();
     }
 
     let squeezed = cache
@@ -154,12 +156,12 @@ async fn read_squeezed_int64_array() {
         .with_cache_policy(Box::new(LiquidPolicy::new()))
         .with_hydration_policy(Box::new(AlwaysHydrate::new()))
         .with_squeeze_policy(Box::new(TranscodeSqueezeEvict))
-        .with_max_cache_bytes(array_size * 2)
-        .with_io_context(Arc::new(DefaultIoContext::new(
+        .with_max_memory_bytes(array_size * 2)
+        .with_store(
             t4::mount(temp_dir.path().join("liquid_cache.t4"))
                 .await
                 .unwrap(),
-        )))
+        )
         .build()
         .await;
 
@@ -171,9 +173,10 @@
             cache
                 .insert(entry_id, int64_array.clone())
                 .with_squeeze_hint(expression.clone())
-                .await;
+                .await
+                .unwrap();
         } else {
-            cache.insert(entry_id, int64_array.clone()).await;
+            cache.insert(entry_id, int64_array.clone()).await.unwrap();
         }
     }
diff --git a/src/core/src/cache/utils.rs b/src/core/src/cache/utils.rs
index a430db48..a4d25343 100644
--- a/src/core/src/cache/utils.rs
+++ b/src/core/src/cache/utils.rs
@@ -8,14 +8,16 @@ use bytes::Bytes;
 #[derive(Debug)]
 pub struct CacheConfig {
     batch_size: usize,
-    max_cache_bytes: usize,
+    max_memory_bytes: usize,
+    max_disk_bytes: usize,
 }
 
 impl CacheConfig {
-    pub(super) fn new(batch_size: usize, max_cache_bytes: usize) -> Self {
+    pub(super) fn new(batch_size: usize, max_memory_bytes: usize, max_disk_bytes: usize) -> Self {
         Self {
             batch_size,
-            max_cache_bytes,
+            max_memory_bytes,
+            max_disk_bytes,
         }
     }
 
@@ -23,8 +25,12 @@
         self.batch_size
     }
 
-    pub fn max_cache_bytes(&self) -> usize {
-        self.max_cache_bytes
+    pub fn max_memory_bytes(&self) -> usize {
+        self.max_memory_bytes
+    }
+
+    pub fn max_disk_bytes(&self) -> usize {
+        self.max_disk_bytes
     }
 }
 
@@ -46,7 +52,7 @@ pub(crate) fn create_test_arrow_array(size: usize) -> ArrayRef {
 
 #[cfg(test)]
 pub(crate) async fn create_cache_store(
-    max_cache_bytes: usize,
+    max_memory_bytes: usize,
     policy: Box<dyn CachePolicy>,
 ) -> Arc {
     use crate::cache::{AlwaysHydrate, LiquidCacheBuilder, TranscodeSqueezeEvict};
 
@@ -55,7 +61,7 @@ pub(crate) async fn create_cache_store(
     let builder = LiquidCacheBuilder::new()
         .with_batch_size(batch_size)
-        .with_max_cache_bytes(max_cache_bytes)
+        .with_max_memory_bytes(max_memory_bytes)
         .with_squeeze_policy(Box::new(TranscodeSqueezeEvict))
         .with_hydration_policy(Box::new(AlwaysHydrate::new()))
         .with_cache_policy(policy);
diff --git a/src/core/src/liquid_array/byte_view_array/mod.rs b/src/core/src/liquid_array/byte_view_array/mod.rs
index b5e78b0e..1d484148 100644
--- a/src/core/src/liquid_array/byte_view_array/mod.rs
+++ b/src/core/src/liquid_array/byte_view_array/mod.rs
@@ -20,7 +20,7 @@ use crate::liquid_array::raw::FsstArray;
 use crate::liquid_array::raw::fsst_buffer::{DiskBuffer, FsstBacking, PrefixKey};
 use crate::liquid_array::{
     LiquidArray, LiquidDataType, LiquidSqueezedArray, LiquidSqueezedArrayRef, SqueezeIoHandler,
-    eval_predicate_on_array,
+    SqueezedBacking, eval_predicate_on_array,
 };
 
 mod comparisons;
@@ -468,6 +468,10 @@ impl LiquidSqueezedArray for LiquidByteViewArray {
         self.original_arrow_type.to_arrow_type()
     }
 
+    fn disk_backing(&self) -> SqueezedBacking {
+        SqueezedBacking::Liquid(self.fsst_buffer.disk_range_len())
+    }
+
     /// Filter the Liquid array with a boolean array and return an **arrow array**.
     async fn filter(&self, selection: &BooleanBuffer) -> ArrayRef {
         let select_any = selection.count_set_bits() > 0;
diff --git a/src/core/src/liquid_array/decimal_array.rs b/src/core/src/liquid_array/decimal_array.rs
index 354f564e..050a746e 100644
--- a/src/core/src/liquid_array/decimal_array.rs
+++ b/src/core/src/liquid_array/decimal_array.rs
@@ -20,7 +20,7 @@ use num_traits::ToPrimitive;
 
 use super::{
     LiquidArray, LiquidDataType, LiquidSqueezedArray, LiquidSqueezedArrayRef, NeedsBacking,
-    Operator, SqueezeIoHandler, SqueezeResult,
+    Operator, SqueezeIoHandler, SqueezeResult, SqueezedBacking,
 };
 use crate::cache::{CacheExpression, LiquidExpr};
 use crate::liquid_array::eval_predicate_on_array;
@@ -537,6 +537,10 @@ impl LiquidSqueezedArray for LiquidDecimalQuantizedArray {
         self.meta.data_type()
     }
 
+    fn disk_backing(&self) -> SqueezedBacking {
+        SqueezedBacking::Liquid((self.disk_range.end - self.disk_range.start) as usize)
+    }
+
     async fn try_eval_predicate(
         &self,
         liquid_expr: &LiquidExpr,
diff --git a/src/core/src/liquid_array/float_array.rs b/src/core/src/liquid_array/float_array.rs
index b3985aa8..f6dea20e 100644
--- a/src/core/src/liquid_array/float_array.rs
+++ b/src/core/src/liquid_array/float_array.rs
@@ -36,7 +36,7 @@ use crate::liquid_array::ipc::{PhysicalTypeMarker, get_physical_type_id};
 use crate::liquid_array::raw::BitPackedArray;
 use crate::liquid_array::{
     LiquidSqueezedArray, LiquidSqueezedArrayRef, NeedsBacking, Operator, SqueezeResult,
-    eval_predicate_on_array, ipc::LiquidIPCHeader,
+    SqueezedBacking, eval_predicate_on_array, ipc::LiquidIPCHeader,
 };
 use crate::utils::get_bit_width;
 use crate::{cache::CacheExpression, liquid_array::SqueezeIoHandler};
@@ -985,6 +985,10 @@ where
         T::DATA_TYPE.clone()
     }
 
+    fn disk_backing(&self) -> SqueezedBacking {
+        SqueezedBacking::Liquid((self.disk_range.end - self.disk_range.start) as usize)
+    }
+
     async fn try_eval_predicate(
         &self,
         liquid_expr: &LiquidExpr,
diff --git a/src/core/src/liquid_array/hybrid_primitive_array.rs b/src/core/src/liquid_array/hybrid_primitive_array.rs
index 15dcb5cb..7f05f934 100644
--- a/src/core/src/liquid_array/hybrid_primitive_array.rs
+++ b/src/core/src/liquid_array/hybrid_primitive_array.rs
@@ -23,7 +23,7 @@ use crate::liquid_array::raw::BitPackedArray;
 use super::primitive_array::LiquidPrimitiveType;
 use super::{
     LiquidDataType, LiquidSqueezedArray, NeedsBacking, Operator, PrimitiveKind, SqueezeIoHandler,
-    SqueezeResult,
+    SqueezeResult, SqueezedBacking,
 };
 
 #[derive(Clone, Copy)]
@@ -317,6 +317,10 @@ where
         T::DATA_TYPE.clone()
     }
 
+    fn disk_backing(&self) -> SqueezedBacking {
+        SqueezedBacking::Liquid((self.disk_range.end - self.disk_range.start) as usize)
+    }
+
     async fn filter(&self, selection: &BooleanBuffer) -> ArrayRef {
         if selection.count_set_bits() == 0 {
             return arrow::array::new_empty_array(&self.original_arrow_data_type());
@@ -689,6 +693,10 @@ where
         T::DATA_TYPE.clone()
     }
 
+    fn disk_backing(&self) -> SqueezedBacking {
+        SqueezedBacking::Liquid((self.disk_range.end - self.disk_range.start) as usize)
+    }
+
     async fn try_eval_predicate(
         &self,
         liquid_expr: &LiquidExpr,
diff --git a/src/core/src/liquid_array/mod.rs b/src/core/src/liquid_array/mod.rs
index 22b07cb0..7776c764 100644
--- a/src/core/src/liquid_array/mod.rs
+++ b/src/core/src/liquid_array/mod.rs
@@ -148,13 +148,25 @@ pub trait LiquidArray: std::fmt::Debug + Send + Sync {
 /// A reference to a Liquid array.
 pub type LiquidArrayRef = Arc<dyn LiquidArray>;
 
-/// On-disk backing type for a hybrid array.
+/// On-disk backing for a squeezed array.
+///
+/// Each variant carries the byte length of the persisted backing data, so the
+/// cache can release the disk budget when the entry is evicted.
 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
 pub enum SqueezedBacking {
     /// Bytes are stored using the Liquid IPC format.
-    Liquid,
+    Liquid(usize),
     /// Bytes are stored using Arrow IPC (or another Arrow-compatible encoding).
-    Arrow,
+    Arrow(usize),
+}
+
+impl SqueezedBacking {
+    /// Byte length of the backing data persisted on disk.
+    pub fn disk_bytes(&self) -> usize {
+        match self {
+            Self::Liquid(n) | Self::Arrow(n) => *n,
+        }
+    }
 }
 
 /// A reference to a Liquid squeezed array.
@@ -245,10 +257,9 @@ pub trait LiquidSqueezedArray: std::fmt::Debug + Send + Sync {
         eval_predicate_on_array(filtered, predicate)
     }
 
-    /// Describe how the squeezed array persists its backing bytes on disk.
-    fn disk_backing(&self) -> SqueezedBacking {
-        SqueezedBacking::Liquid
-    }
+    /// Describe how the squeezed array persists its backing bytes on disk,
+    /// including the byte length of the persisted data.
+    fn disk_backing(&self) -> SqueezedBacking;
 }
 
 pub(crate) fn eval_predicate_on_array(array: ArrayRef, predicate: &LiquidExpr) -> BooleanArray {
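Note: with the byte length carried inside the `SqueezedBacking` variants, accounting code no longer needs to care which IPC format backs an entry. The new `disk_bytes` helper and an explicit match are equivalent, as this small illustrative use shows:

```rust
fn disk_cost(backing: SqueezedBacking) -> usize {
    // Same arm body for both formats: only the size matters for budgeting.
    match backing {
        SqueezedBacking::Liquid(n) | SqueezedBacking::Arrow(n) => n,
    }
}

fn demo() {
    let backing = SqueezedBacking::Arrow(512); // 512 bytes of Arrow IPC on disk
    assert_eq!(disk_cost(backing), backing.disk_bytes());
}
```

Making `disk_backing` a required method (the old default silently claimed `Liquid`) also forces every squeezed array to state its footprint explicitly, which is why the per-type implementations are added throughout this diff.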
diff --git a/src/core/src/liquid_array/raw/fsst_buffer.rs b/src/core/src/liquid_array/raw/fsst_buffer.rs
index f518cdae..4e64b6f4 100644
--- a/src/core/src/liquid_array/raw/fsst_buffer.rs
+++ b/src/core/src/liquid_array/raw/fsst_buffer.rs
@@ -718,6 +718,10 @@ impl DiskBuffer {
         self.disk_range.clone()
     }
 
+    pub(crate) fn disk_range_len(&self) -> usize {
+        (self.disk_range.end - self.disk_range.start) as usize
+    }
+
     pub(crate) fn compressor_arc(&self) -> Arc {
         self.compressor.clone()
     }
diff --git a/src/core/src/liquid_array/squeezed_date32_array.rs b/src/core/src/liquid_array/squeezed_date32_array.rs
index 67e2b93a..aa3aa01d 100644
--- a/src/core/src/liquid_array/squeezed_date32_array.rs
+++ b/src/core/src/liquid_array/squeezed_date32_array.rs
@@ -16,7 +16,7 @@ use std::sync::Arc;
 
 use super::LiquidArray;
 use super::primitive_array::LiquidPrimitiveArray;
-use super::{LiquidDataType, LiquidSqueezedArray};
+use super::{LiquidDataType, LiquidSqueezedArray, SqueezedBacking};
 use crate::cache::LiquidExpr;
 use crate::liquid_array::LiquidPrimitiveType;
 use crate::liquid_array::SqueezeIoHandler;
@@ -49,11 +49,11 @@ pub struct SqueezedDate32Array {
     /// The minimum extracted value used as reference for offsetting.
     reference_value: i32,
     original_data_type: DataType,
-    backing: Option<SqueezedBacking>,
+    backing: Option<DiskBacking>,
 }
 
 #[derive(Debug, Clone)]
-struct SqueezedBacking {
+struct DiskBacking {
     io: Arc<dyn SqueezeIoHandler>,
     disk_range: Range<u64>,
 }
@@ -225,7 +225,7 @@ impl SqueezedDate32Array {
         io: Arc<dyn SqueezeIoHandler>,
         disk_range: Range<u64>,
     ) -> Self {
-        self.backing = Some(SqueezedBacking { io, disk_range });
+        self.backing = Some(DiskBacking { io, disk_range });
         self
     }
 
@@ -261,12 +261,14 @@
         self.field
     }
 
-    /// Convert to an Arrow array holding the extracted component.
+    /// Convert to an Arrow array shaped like the original input, encoded so that
+    /// re-applying `date_part` (or any equivalent extraction) recovers the
+    /// component value originally squeezed.
     pub fn to_component_array(&self) -> ArrayRef {
         match &self.original_data_type {
-            DataType::Date32 => Arc::new(self.to_component_date32()) as ArrayRef,
-            DataType::Timestamp(unit, _) => self.to_component_timestamp(*unit),
-            _ => Arc::new(self.to_component_date32()) as ArrayRef,
+            DataType::Date32 => Arc::new(self.to_arrow_date32_lossy()) as ArrayRef,
+            DataType::Timestamp(unit, _) => self.to_arrow_timestamp_lossy(*unit),
+            _ => Arc::new(self.to_arrow_date32_lossy()) as ArrayRef,
         }
     }
 
@@ -281,28 +283,35 @@
         PrimitiveArray::<Date32Type>::new(signed_values, nulls)
     }
 
-    fn to_component_timestamp(&self, unit: TimeUnit) -> ArrayRef {
-        let unsigned: PrimitiveArray<UInt32Type> = self.bit_packed.to_primitive();
-        let (_dt, values, nulls) = unsigned.into_parts();
-        let ref_v = self.reference_value;
-        let signed_values: ScalarBuffer<i64> =
-            ScalarBuffer::from_iter(values.iter().map(|&v| (v as i32 + ref_v) as i64));
-
+    /// Lossy reconstruction to Arrow Timestamp at the requested unit, using the
+    /// same date mapping as [`Self::to_arrow_date32_lossy`] (midnight UTC of the
+    /// reconstructed date).
+    pub fn to_arrow_timestamp_lossy(&self, unit: TimeUnit) -> ArrayRef {
+        let date_array = self.to_arrow_date32_lossy();
+        let (_dt, day_values, nulls) = date_array.into_parts();
+        let ticks_per_day: i64 = match unit {
+            TimeUnit::Second => 86_400,
+            TimeUnit::Millisecond => 86_400_000,
+            TimeUnit::Microsecond => 86_400_000_000,
+            TimeUnit::Nanosecond => 86_400_000_000_000,
+        };
+        let tick_values: ScalarBuffer<i64> =
+            ScalarBuffer::from_iter(day_values.iter().map(|&d| (d as i64) * ticks_per_day));
         match unit {
             TimeUnit::Second => Arc::new(PrimitiveArray::<TimestampSecondType>::new(
-                signed_values,
+                tick_values,
                 nulls,
             )),
             TimeUnit::Millisecond => Arc::new(PrimitiveArray::<TimestampMillisecondType>::new(
-                signed_values.clone(),
+                tick_values,
                 nulls,
             )),
             TimeUnit::Microsecond => Arc::new(PrimitiveArray::<TimestampMicrosecondType>::new(
-                signed_values.clone(),
+                tick_values,
                 nulls,
             )),
             TimeUnit::Nanosecond => Arc::new(PrimitiveArray::<TimestampNanosecondType>::new(
-                signed_values,
+                tick_values,
                 nulls,
             )),
         }
@@ -450,6 +459,14 @@ impl LiquidSqueezedArray for SqueezedDate32Array {
         self.original_data_type.clone()
     }
 
+    fn disk_backing(&self) -> SqueezedBacking {
+        let backing = self
+            .backing
+            .as_ref()
+            .expect("SqueezedDate32Array backing not set");
+        SqueezedBacking::Liquid((backing.disk_range.end - backing.disk_range.start) as usize)
+    }
+
     async fn filter(&self, selection: &BooleanBuffer) -> ArrayRef {
         if selection.count_set_bits() == 0 {
             return arrow::array::new_empty_array(&self.original_arrow_data_type());
@@ -645,8 +662,56 @@ mod tests {
         }
     }
 
+    /// `to_component_array` is consumed by [`crate::cache::core::LiquidCache::try_read_squeezed_date32_array`]
+    /// as the SQL fast path. The query plan still runs `date_part` on the returned array, so the
+    /// values must round-trip through `component_from_days`: feeding a returned Date32 day-value
+    /// back into `component_from_days(field, days)` must recover the original component.
+    ///
+    /// Before the encoding fix, the Year case returned `Date32(year_int)` (e.g. year 1970 became
+    /// Date32 day-1970 = 1975-05-24), so re-extracting the year gave 1975 instead of 1970.
+    #[test]
+    fn to_component_array_date32_round_trips_through_extract() {
+        let inputs: Vec<Option<i32>> = vec![
+            Some(ymd_to_epoch_days(1970, 1, 1)),
+            Some(ymd_to_epoch_days(1971, 7, 15)),
+            Some(ymd_to_epoch_days(1999, 12, 31)),
+            Some(ymd_to_epoch_days(2024, 2, 29)),
+            Some(ymd_to_epoch_days(4709, 11, 24)),
+            None,
+        ];
+        let expected_components: Vec<Option<i32>> = inputs
+            .iter()
+            .map(|opt| opt.map(|d| component_from_days(Date32Field::Year, d)))
+            .collect();
+
+        let arr = dates(&inputs);
+        let liquid = LiquidPrimitiveArray::<Date32Type>::from_arrow_array(arr);
+        let squeezed = SqueezedDate32Array::from_liquid_date32(&liquid, Date32Field::Year);
+        let component = squeezed
+            .to_component_array()
+            .as_any()
+            .downcast_ref::<PrimitiveArray<Date32Type>>()
+            .expect("date32 component array")
+            .clone();
+
+        for (idx, expected) in expected_components.iter().enumerate() {
+            match expected {
+                Some(year) => {
+                    assert!(!component.is_null(idx), "row {idx} unexpectedly null");
+                    let recovered = component_from_days(Date32Field::Year, component.value(idx));
+                    assert_eq!(
+                        recovered, *year,
+                        "row {idx}: extracting Year from to_component_array output recovered {recovered}, expected {year}",
+                    );
+                }
+                None => assert!(component.is_null(idx), "row {idx} should be null"),
+            }
+        }
+    }
+
     #[test]
     fn test_timestamp_extraction() {
+        // Two Microsecond timestamps at 2021-01-01 00:00:00 UTC and 2022-01-01 00:00:00 UTC.
         let input = vec![
             Some(1_609_459_200_000_000),
             Some(1_640_995_200_000_000),
             None,
         ];
@@ -661,8 +726,23 @@
             .downcast_ref::<PrimitiveArray<TimestampMicrosecondType>>()
             .expect("timestamp array");
 
-        assert_eq!(out.value(0), 2021);
-        assert_eq!(out.value(1), 2022);
+        // to_component_array returns Timestamps that round-trip through `date_part`:
+        // year 2021 maps to (2021,1,1) at midnight UTC.
+        let micros_per_day: i64 = 86_400_000_000;
+        assert_eq!(
+            out.value(0),
+            ymd_to_epoch_days(2021, 1, 1) as i64 * micros_per_day,
+        );
+        assert_eq!(
+            out.value(1),
+            ymd_to_epoch_days(2022, 1, 1) as i64 * micros_per_day,
+        );
         assert!(out.is_null(2));
+
+        // Direct integer view is still available via `to_component_date32`.
+        let int_view = squeezed.to_component_date32();
+        assert_eq!(int_view.value(0), 2021);
+        assert_eq!(int_view.value(1), 2022);
+        assert!(int_view.is_null(2));
     }
 }
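Note: the updated timestamp test above relies on the day-to-tick mapping in `to_arrow_timestamp_lossy`: a reconstructed date becomes midnight UTC of that day at the requested unit. A standalone arithmetic check of the first test value:

```rust
// 2021-01-01 is 18_628 days after the Unix epoch (51 * 365 + 13 leap days).
// At microsecond resolution, midnight UTC of that day is exactly the first
// input value used by test_timestamp_extraction above.
fn main() {
    let days_2021_01_01: i64 = 18_628;
    let micros_per_day: i64 = 86_400_000_000;
    assert_eq!(days_2021_01_01 * micros_per_day, 1_609_459_200_000_000);
}
```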
diff --git a/src/core/src/liquid_array/variant_array.rs b/src/core/src/liquid_array/variant_array.rs
index 4b4e122d..aaae6f38 100644
--- a/src/core/src/liquid_array/variant_array.rs
+++ b/src/core/src/liquid_array/variant_array.rs
@@ -16,6 +16,7 @@ pub struct VariantStructSqueezedArray {
     len: usize,
     nulls: Option<NullBuffer>,
     original_arrow_type: DataType,
+    disk_backing_size: usize,
 }
 
 impl VariantStructSqueezedArray {
@@ -24,6 +25,7 @@
         values: Vec<(Arc<str>, LiquidArrayRef)>,
         nulls: Option<NullBuffer>,
         original_arrow_type: DataType,
+        disk_backing_size: usize,
     ) -> Self {
         let len = values.first().map(|(_, array)| array.len()).unwrap_or(0);
         let mut map = AHashMap::with_capacity(values.len());
@@ -36,6 +38,7 @@
             len,
             nulls,
             original_arrow_type,
+            disk_backing_size,
         }
     }
 
@@ -100,6 +103,7 @@
             filtered,
             self.nulls.clone(),
             self.original_arrow_type.clone(),
+            self.disk_backing_size,
         );
         Ok(Arc::new(filtered.build_root_struct()) as ArrayRef)
     }
@@ -148,7 +152,7 @@ impl LiquidSqueezedArray for VariantStructSqueezedArray {
     }
 
     fn disk_backing(&self) -> SqueezedBacking {
-        SqueezedBacking::Arrow
+        SqueezedBacking::Arrow(self.disk_backing_size)
     }
 }
 
@@ -248,6 +252,7 @@ mod tests {
             ],
             None,
             DataType::Struct(Fields::from(Vec::<Arc<Field>>::new())),
+            0,
         );
 
         // Request only time_us; did should be pruned from typed_value.
diff --git a/src/core/study/cache_storage.rs b/src/core/study/cache_storage.rs
index d9a819d3..0c69f448 100644
--- a/src/core/study/cache_storage.rs
+++ b/src/core/study/cache_storage.rs
@@ -10,13 +10,12 @@ use datafusion::logical_expr::Operator;
 use datafusion::prelude::*;
 use datafusion::scalar::ScalarValue;
 use futures::StreamExt;
-use liquid_cache::cache::DefaultIoContext;
 use liquid_cache::cache::EntryID;
 use liquid_cache::cache::LiquidCache;
 use liquid_cache::cache::LiquidCacheBuilder;
 use liquid_cache::cache::LiquidExpr;
+use liquid_cache::cache::LiquidPolicy;
 use liquid_cache::cache::squeeze_policies::TranscodeSqueezeEvict;
-use liquid_cache::cache_policies::FiloPolicy;
 
 #[global_allocator]
 static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc;
@@ -48,13 +47,12 @@ fn main() {
         .unwrap_or_else(|| tempfile::tempdir().unwrap().keep());
     let store_path = cache_dir.join("liquid_cache.t4");
     let store = tokio_test::block_on(t4::mount(&store_path)).expect("failed to mount t4 store");
-    let io_context = Arc::new(DefaultIoContext::new(store));
 
     let storage = tokio_test::block_on(async {
         LiquidCacheBuilder::new()
-            .with_max_cache_bytes(500 * 1024 * 1024)
+            .with_max_memory_bytes(500 * 1024 * 1024)
             .with_squeeze_policy(Box::new(TranscodeSqueezeEvict))
-            .with_cache_policy(Box::new(FiloPolicy::new()))
-            .with_io_context(io_context)
+            .with_cache_policy(Box::new(LiquidPolicy::new()))
+            .with_store(store)
             .build()
             .await
    });
@@ -146,7 +144,7 @@ fn load_and_insert_referer(
         let id = EntryID::from(idx);
         ids.push(id);
         total_size += array.get_array_memory_size();
-        storage.insert(id, array).await;
+        storage.insert(id, array).await.unwrap();
         idx += 1;
     }
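Note: the study binary shows the whole builder migration in one place: the manual `DefaultIoContext` wrapper is gone, the t4 store goes straight into `with_store`, `FiloPolicy` gives way to `LiquidPolicy`, and the memory budget keeps its old value under the new `with_max_memory_bytes` name. Condensed into one function (a sketch assuming, as the study's imports suggest, that `build()` yields the `LiquidCache` handle):

```rust
async fn build_study_cache(cache_dir: std::path::PathBuf) -> LiquidCache {
    // Mount failures stay fatal here, mirroring the study's `expect`.
    let store = t4::mount(cache_dir.join("liquid_cache.t4"))
        .await
        .expect("failed to mount t4 store");
    LiquidCacheBuilder::new()
        .with_max_memory_bytes(500 * 1024 * 1024)
        .with_squeeze_policy(Box::new(TranscodeSqueezeEvict))
        .with_cache_policy(Box::new(LiquidPolicy::new()))
        .with_store(store)
        .build()
        .await
}
```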
diff --git a/src/datafusion-local/README.md b/src/datafusion-local/README.md
index 142049b0..db2df15f 100644
--- a/src/datafusion-local/README.md
+++ b/src/datafusion-local/README.md
@@ -8,7 +8,7 @@ This crate provides an in-process version of LiquidCache that doesn't require a
 
 ```rust
 use liquid_cache_datafusion_local::{
-    storage::cache_policies::FiloPolicy,
+    storage::cache_policies::LiquidPolicy,
     LiquidCacheLocalBuilder,
 };
 use datafusion::prelude::SessionConfig;
@@ -19,9 +19,9 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
     let temp_dir = TempDir::new().unwrap();
 
     let (ctx, _cache) = LiquidCacheLocalBuilder::new()
-        .with_max_cache_bytes(1024 * 1024 * 1024) // 1GB
+        .with_max_memory_bytes(1024 * 1024 * 1024) // 1GB
         .with_cache_dir(temp_dir.path().to_path_buf())
-        .with_cache_policy(Box::new(FiloPolicy::new()))
+        .with_cache_policy(Box::new(LiquidPolicy::new()))
         .build(SessionConfig::new())
         .await?;
diff --git a/src/datafusion-local/src/lib.rs b/src/datafusion-local/src/lib.rs
index 678ae75e..81a259aa 100644
--- a/src/datafusion-local/src/lib.rs
+++ b/src/datafusion-local/src/lib.rs
@@ -12,8 +12,7 @@ use datafusion::logical_expr::ScalarUDF;
 use datafusion::prelude::{SessionConfig, SessionContext};
 use liquid_cache::cache::squeeze_policies::{SqueezePolicy, TranscodeSqueezeEvict};
 use liquid_cache::cache::{AlwaysHydrate, HydrationPolicy};
-use liquid_cache::cache_policies::CachePolicy;
-use liquid_cache::cache_policies::LiquidPolicy;
+use liquid_cache::cache_policies::{CachePolicy, LiquidPolicy};
 use liquid_cache_datafusion::optimizers::{LineageOptimizer, LocalModeOptimizer};
 use liquid_cache_datafusion::{
     LiquidCacheParquet, LiquidCacheParquetRef, VariantGetUdf, VariantPretty, VariantToJsonUdf,
@@ -41,7 +40,7 @@ pub use liquid_cache_common as common;
 /// let temp_dir = TempDir::new().unwrap();
 ///
 /// let (ctx, _) = LiquidCacheLocalBuilder::new()
-///     .with_max_cache_bytes(1024 * 1024 * 1024) // 1GB
+///     .with_max_memory_bytes(1024 * 1024 * 1024) // 1GB
 ///     .with_cache_dir(temp_dir.path().to_path_buf())
 ///     .with_cache_policy(Box::new(LiquidPolicy::new()))
 ///     .build(SessionConfig::new())
@@ -58,8 +57,8 @@ pub use liquid_cache_common as common;
 pub struct LiquidCacheLocalBuilder {
     /// Size of batches for caching
     batch_size: usize,
-    /// Maximum cache size in bytes
-    max_cache_bytes: usize,
+    /// Maximum memory size in bytes
+    max_memory_bytes: usize,
     /// Directory for disk cache
     cache_dir: PathBuf,
     /// Cache policy
@@ -69,21 +68,18 @@
     /// Hydration policy
     hydration_policy: Box<dyn HydrationPolicy>,
     span: fastrace::Span,
-
-    eager_shredding: bool,
 }
 
 impl Default for LiquidCacheLocalBuilder {
     fn default() -> Self {
         Self {
             batch_size: 8192,
-            max_cache_bytes: 1024 * 1024 * 1024, // 1GB
+            max_memory_bytes: 1024 * 1024 * 1024, // 1GB
             cache_dir: std::env::temp_dir(),
             cache_policy: Box::new(LiquidPolicy::new()),
             squeeze_policy: Box::new(TranscodeSqueezeEvict),
             hydration_policy: Box::new(AlwaysHydrate::new()),
             span: fastrace::Span::enter_with_local_parent("liquid_cache_datafusion_local_builder"),
-            eager_shredding: true,
         }
     }
 }
@@ -100,9 +96,9 @@ impl LiquidCacheLocalBuilder {
         self
     }
 
-    /// Set maximum cache size in bytes
-    pub fn with_max_cache_bytes(mut self, max_cache_bytes: usize) -> Self {
-        self.max_cache_bytes = max_cache_bytes;
+    /// Set maximum memory size in bytes
+    pub fn with_max_memory_bytes(mut self, max_memory_bytes: usize) -> Self {
+        self.max_memory_bytes = max_memory_bytes;
         self
     }
 
@@ -136,12 +132,6 @@
         self
     }
 
-    /// Set enable shredding
-    pub fn with_eager_shredding(mut self, eager_shredding: bool) -> Self {
-        self.eager_shredding = eager_shredding;
-        self
-    }
-
     /// Build a SessionContext with liquid cache configured
     /// Returns the SessionContext and the liquid cache reference
     pub async fn build(
@@ -161,20 +151,35 @@
         let store = t4::mount(self.cache_dir.join("liquid_cache.t4"))
             .await
             .map_err(|e| datafusion::error::DataFusionError::External(Box::new(e)))?;
+        #[cfg(not(test))]
         let cache = LiquidCacheParquet::new(
             self.batch_size,
-            self.max_cache_bytes,
+            self.max_memory_bytes,
+            usize::MAX,
+            store,
+            self.cache_policy,
+            self.squeeze_policy,
+            self.hydration_policy,
+        )
+        .await;
+
+        #[cfg(test)]
+        let cache = LiquidCacheParquet::new_with_squeeze_victim_concurrency(
+            self.batch_size,
+            self.max_memory_bytes,
+            usize::MAX,
             store,
             self.cache_policy,
             self.squeeze_policy,
             self.hydration_policy,
+            false,
         )
         .await;
 
         let cache_ref = Arc::new(cache);
 
         let date_extract_optimizer = Arc::new(LineageOptimizer::new());
-        let optimizer = LocalModeOptimizer::new(cache_ref.clone(), self.eager_shredding);
+        let optimizer = LocalModeOptimizer::new(cache_ref.clone());
 
         let state = datafusion::execution::SessionStateBuilder::new()
             .with_config(config)
diff --git a/src/datafusion-local/src/tests/date_optimizer.rs b/src/datafusion-local/src/tests/date_optimizer.rs
index ab421b2b..a3e1edb7 100644
--- a/src/datafusion-local/src/tests/date_optimizer.rs
+++ b/src/datafusion-local/src/tests/date_optimizer.rs
@@ -37,7 +37,7 @@ async fn general_test(sql: &str) -> CacheStatsSummary {
 
     // Set up the session context with liquid cache
     let lc_builder = LiquidCacheLocalBuilder::new()
-        .with_max_cache_bytes(1024 * 1024)
+        .with_max_memory_bytes(1024 * 1024)
         .with_cache_dir(cache_dir.path().to_path_buf())
         .with_squeeze_policy(Box::new(TranscodeSqueezeEvict))
         .with_cache_policy(Box::new(liquid_cache::cache_policies::LiquidPolicy::new()));
diff --git a/src/datafusion-local/src/tests/mod.rs b/src/datafusion-local/src/tests/mod.rs
index 67d2d31a..f92b9852 100644
--- a/src/datafusion-local/src/tests/mod.rs
+++ b/src/datafusion-local/src/tests/mod.rs
@@ -103,7 +103,7 @@ async fn create_session_context_with_liquid_cache(
     let mut config = SessionConfig::new();
     config.options_mut().execution.target_partitions = 4;
     let (ctx, cache) = LiquidCacheLocalBuilder::new()
-        .with_max_cache_bytes(cache_size_bytes)
+        .with_max_memory_bytes(cache_size_bytes)
         .with_cache_dir(cache_dir.to_path_buf())
         .with_squeeze_policy(squeeze_policy)
         .with_cache_policy(Box::new(LiquidPolicy::new()))
@@ -342,7 +342,7 @@ async fn test_provide_schema2() {
     config.options_mut().execution.target_partitions = 4;
     let (liquid_ctx, cache) = LiquidCacheLocalBuilder::new()
         .with_cache_dir(cache_dir.path().to_path_buf())
-        .with_max_cache_bytes(1024 * 1024)
+        .with_max_memory_bytes(1024 * 1024)
         .with_squeeze_policy(Box::new(TranscodeSqueezeEvict))
         .build(config)
         .await
diff --git a/src/datafusion-local/src/tests/snapshots/liquid_cache_datafusion_local__tests__os_selection.snap b/src/datafusion-local/src/tests/snapshots/liquid_cache_datafusion_local__tests__os_selection.snap
index 33cb1a4f..e8cff162 100644
--- a/src/datafusion-local/src/tests/snapshots/liquid_cache_datafusion_local__tests__os_selection.snap
+++ b/src/datafusion-local/src/tests/snapshots/liquid_cache_datafusion_local__tests__os_selection.snap
@@ -51,8 +51,10 @@ RuntimeStatsSnapshot:
   get_squeezed_needs_io: 1
   try_read_liquid_calls: 0
   hit_date32_expression_calls: 0
-  read_io_count: 1
+  read_io_count: 2
   write_io_count: 0
+  disk_evictions: 0
+  disk_reservation_failures: 0
   eval_predicate_on_liquid_failed: 0
   squeezed_decompressed_count: 2141
   squeezed_total_count: 2164
diff --git a/src/datafusion-local/src/tests/snapshots/liquid_cache_datafusion_local__tests__provide_schema2.snap b/src/datafusion-local/src/tests/snapshots/liquid_cache_datafusion_local__tests__provide_schema2.snap
index 526b2b0e..56fb14bb 100644
--- a/src/datafusion-local/src/tests/snapshots/liquid_cache_datafusion_local__tests__provide_schema2.snap
+++ b/src/datafusion-local/src/tests/snapshots/liquid_cache_datafusion_local__tests__provide_schema2.snap
@@ -38,6 +38,8 @@ RuntimeStatsSnapshot:
   hit_date32_expression_calls: 0
   read_io_count: 0
   write_io_count: 0
+  disk_evictions: 0
+  disk_reservation_failures: 0
   eval_predicate_on_liquid_failed: 0
   squeezed_decompressed_count: 0
   squeezed_total_count: 0
@@ -181,6 +183,8 @@ RuntimeStatsSnapshot:
   hit_date32_expression_calls: 0
   read_io_count: 0
   write_io_count: 0
+  disk_evictions: 0
+  disk_reservation_failures: 0
   eval_predicate_on_liquid_failed: 0
   squeezed_decompressed_count: 0
   squeezed_total_count: 0
@@ -225,6 +229,8 @@ RuntimeStatsSnapshot:
   hit_date32_expression_calls: 0
   read_io_count: 0
   write_io_count: 0
+  disk_evictions: 0
+  disk_reservation_failures: 0
   eval_predicate_on_liquid_failed: 0
   squeezed_decompressed_count: 0
   squeezed_total_count: 0
diff --git a/src/datafusion-local/src/tests/snapshots/liquid_cache_datafusion_local__tests__provide_schema_with_filter.snap b/src/datafusion-local/src/tests/snapshots/liquid_cache_datafusion_local__tests__provide_schema_with_filter.snap
index f1ffecfd..1cef51ab 100644
--- a/src/datafusion-local/src/tests/snapshots/liquid_cache_datafusion_local__tests__provide_schema_with_filter.snap
+++ b/src/datafusion-local/src/tests/snapshots/liquid_cache_datafusion_local__tests__provide_schema_with_filter.snap
b/src/datafusion-local/src/tests/snapshots/liquid_cache_datafusion_local__tests__provide_schema_with_filter.snap @@ -54,6 +54,8 @@ RuntimeStatsSnapshot: hit_date32_expression_calls: 0 read_io_count: 0 write_io_count: 0 + disk_evictions: 0 + disk_reservation_failures: 0 eval_predicate_on_liquid_failed: 0 squeezed_decompressed_count: 0 squeezed_total_count: 0 diff --git a/src/datafusion-local/src/tests/snapshots/liquid_cache_datafusion_local__tests__referer_filtering.snap b/src/datafusion-local/src/tests/snapshots/liquid_cache_datafusion_local__tests__referer_filtering.snap index 767b66d3..7ac42fc1 100644 --- a/src/datafusion-local/src/tests/snapshots/liquid_cache_datafusion_local__tests__referer_filtering.snap +++ b/src/datafusion-local/src/tests/snapshots/liquid_cache_datafusion_local__tests__referer_filtering.snap @@ -35,13 +35,13 @@ values: stats: entries.total: 8 entries.after_first_run: 8 -entries.memory.arrow: 2 -entries.memory.liquid: 6 +entries.memory.arrow: 0 +entries.memory.liquid: 8 entries.memory.squeezed_liquid: 0 entries.disk.liquid: 0 entries.disk.arrow: 0 -usage.memory_bytes: 885884 -usage.disk_bytes: 729144 +usage.memory_bytes: 884561 +usage.disk_bytes: 877216 RuntimeStatsSnapshot: get: 2 get_with_selection: 2 @@ -50,8 +50,10 @@ RuntimeStatsSnapshot: get_squeezed_needs_io: 0 try_read_liquid_calls: 0 hit_date32_expression_calls: 0 - read_io_count: 0 + read_io_count: 5 write_io_count: 0 + disk_evictions: 0 + disk_reservation_failures: 0 eval_predicate_on_liquid_failed: 0 squeezed_decompressed_count: 0 squeezed_total_count: 0 diff --git a/src/datafusion-local/src/tests/snapshots/liquid_cache_datafusion_local__tests__single_column_filter_projection.snap b/src/datafusion-local/src/tests/snapshots/liquid_cache_datafusion_local__tests__single_column_filter_projection.snap index b5db557e..4f70d628 100644 --- a/src/datafusion-local/src/tests/snapshots/liquid_cache_datafusion_local__tests__single_column_filter_projection.snap +++ b/src/datafusion-local/src/tests/snapshots/liquid_cache_datafusion_local__tests__single_column_filter_projection.snap @@ -38,6 +38,8 @@ RuntimeStatsSnapshot: hit_date32_expression_calls: 0 read_io_count: 0 write_io_count: 0 + disk_evictions: 0 + disk_reservation_failures: 0 eval_predicate_on_liquid_failed: 0 squeezed_decompressed_count: 0 squeezed_total_count: 0 diff --git a/src/datafusion-local/src/tests/snapshots/liquid_cache_datafusion_local__tests__squeeze__basic_squeeze.snap b/src/datafusion-local/src/tests/snapshots/liquid_cache_datafusion_local__tests__squeeze__basic_squeeze.snap index f9e15ff4..a162c4af 100644 --- a/src/datafusion-local/src/tests/snapshots/liquid_cache_datafusion_local__tests__squeeze__basic_squeeze.snap +++ b/src/datafusion-local/src/tests/snapshots/liquid_cache_datafusion_local__tests__squeeze__basic_squeeze.snap @@ -9,6 +9,8 @@ event=squeeze_begin victims=[0] event=squeeze_victim entry=0 event=insert_success entry=0 kind=MemoryLiquid event=insert_success entry=262144 kind=MemoryArrow +event=eval_predicate entry=0 selection=true cached=MemoryLiquid +event=read entry=262144 selection=true expr=None cached=MemoryArrow event=insert_failed entry=1 kind=MemoryArrow event=squeeze_begin victims=[262144,0] event=squeeze_victim entry=262144 @@ -23,10 +25,12 @@ event=squeeze_victim entry=1 event=insert_success entry=1 kind=MemoryLiquid event=squeeze_victim entry=262144 event=io_write entry=262144 kind=DiskLiquid bytes=17448 +event=insert_success entry=262144 kind=DiskLiquid event=squeeze_victim entry=0 event=insert_success entry=0 
kind=DiskLiquid -event=insert_success entry=262144 kind=DiskLiquid event=insert_success entry=262145 kind=MemoryArrow +event=eval_predicate entry=1 selection=true cached=MemoryLiquid +event=read entry=262145 selection=true expr=None cached=MemoryArrow event=insert_failed entry=2 kind=MemoryArrow event=squeeze_begin victims=[262145,1] event=squeeze_victim entry=262145 @@ -41,64 +45,12 @@ event=squeeze_victim entry=2 event=insert_success entry=2 kind=MemoryLiquid event=squeeze_victim entry=262145 event=io_write entry=262145 kind=DiskLiquid bytes=17448 +event=insert_success entry=262145 kind=DiskLiquid event=squeeze_victim entry=1 event=insert_success entry=1 kind=DiskLiquid -event=insert_success entry=262145 kind=DiskLiquid event=insert_success entry=262146 kind=MemoryArrow -event=eval_predicate entry=0 selection=true cached=DiskLiquid -event=io_read_liquid entry=0 bytes=63528 -event=hydrate entry=0 cached=DiskLiquid new=MemoryLiquid -event=insert_failed entry=0 kind=MemoryLiquid -event=squeeze_begin victims=[262146,2] -event=squeeze_victim entry=262146 -event=insert_success entry=262146 kind=MemoryLiquid -event=squeeze_victim entry=2 -event=io_write entry=2 kind=MemorySqueezedLiquid bytes=63528 -event=insert_success entry=2 kind=MemorySqueezedLiquid -event=insert_success entry=0 kind=MemoryLiquid -event=read entry=262144 selection=true expr=None cached=DiskLiquid -event=io_read_liquid entry=262144 bytes=17448 -event=hydrate entry=262144 cached=DiskLiquid new=MemoryLiquid -event=insert_success entry=262144 kind=MemoryLiquid -event=eval_predicate entry=1 selection=true cached=DiskLiquid -event=io_read_liquid entry=1 bytes=63528 -event=hydrate entry=1 cached=DiskLiquid new=MemoryLiquid -event=insert_failed entry=1 kind=MemoryLiquid -event=squeeze_begin victims=[262146,0,262144,2] -event=squeeze_victim entry=262146 -event=io_write entry=262146 kind=DiskLiquid bytes=17448 -event=squeeze_victim entry=0 -event=io_write entry=0 kind=MemorySqueezedLiquid bytes=63528 -event=squeeze_victim entry=262144 -event=io_write entry=262144 kind=DiskLiquid bytes=17448 -event=squeeze_victim entry=2 -event=insert_success entry=2 kind=DiskLiquid -event=insert_success entry=262146 kind=DiskLiquid -event=insert_success entry=0 kind=MemorySqueezedLiquid -event=insert_success entry=262144 kind=DiskLiquid -event=insert_success entry=1 kind=MemoryLiquid -event=read entry=262145 selection=true expr=None cached=DiskLiquid -event=io_read_liquid entry=262145 bytes=17448 -event=hydrate entry=262145 cached=DiskLiquid new=MemoryLiquid -event=insert_success entry=262145 kind=MemoryLiquid -event=eval_predicate entry=2 selection=true cached=DiskLiquid -event=io_read_liquid entry=2 bytes=63528 -event=hydrate entry=2 cached=DiskLiquid new=MemoryLiquid -event=insert_failed entry=2 kind=MemoryLiquid -event=squeeze_begin victims=[1,262145,0] -event=squeeze_victim entry=1 -event=io_write entry=1 kind=MemorySqueezedLiquid bytes=63528 -event=squeeze_victim entry=262145 -event=io_write entry=262145 kind=DiskLiquid bytes=17448 -event=squeeze_victim entry=0 -event=insert_success entry=0 kind=DiskLiquid -event=insert_success entry=1 kind=MemorySqueezedLiquid -event=insert_success entry=262145 kind=DiskLiquid -event=insert_success entry=2 kind=MemoryLiquid -event=read entry=262146 selection=true expr=None cached=DiskLiquid -event=io_read_liquid entry=262146 bytes=17448 -event=hydrate entry=262146 cached=DiskLiquid new=MemoryLiquid -event=insert_success entry=262146 kind=MemoryLiquid +event=eval_predicate entry=2 selection=true 
cached=MemoryLiquid +event=read entry=262146 selection=true expr=None cached=MemoryArrow event=insert_success entry=4294967296 kind=MemoryArrow event=insert_success entry=4295229440 kind=MemoryArrow event=eval_predicate entry=4294967296 selection=true cached=MemoryArrow diff --git a/src/datafusion-local/src/tests/snapshots/liquid_cache_datafusion_local__tests__squeeze__squeeze_distinct_search_phase.snap b/src/datafusion-local/src/tests/snapshots/liquid_cache_datafusion_local__tests__squeeze__squeeze_distinct_search_phase.snap index 80af5e42..e41d5bba 100644 --- a/src/datafusion-local/src/tests/snapshots/liquid_cache_datafusion_local__tests__squeeze__squeeze_distinct_search_phase.snap +++ b/src/datafusion-local/src/tests/snapshots/liquid_cache_datafusion_local__tests__squeeze__squeeze_distinct_search_phase.snap @@ -4,11 +4,15 @@ expression: trace --- EventTrace: [ event=insert_success entry=2555904 kind=MemoryArrow +event=eval_predicate entry=2555904 selection=true cached=MemoryArrow +event=read entry=2555904 selection=true expr=None cached=MemoryArrow event=insert_failed entry=2555905 kind=MemoryArrow event=squeeze_begin victims=[2555904] event=squeeze_victim entry=2555904 event=insert_success entry=2555904 kind=MemoryLiquid event=insert_success entry=2555905 kind=MemoryArrow +event=eval_predicate entry=2555905 selection=true cached=MemoryArrow +event=read entry=2555905 selection=true expr=None cached=MemoryArrow event=insert_failed entry=2555906 kind=MemoryArrow event=squeeze_begin victims=[2555905,2555904] event=squeeze_victim entry=2555905 @@ -17,11 +21,6 @@ event=squeeze_victim entry=2555904 event=io_write entry=2555904 kind=MemorySqueezedLiquid bytes=27320 event=insert_success entry=2555904 kind=MemorySqueezedLiquid event=insert_success entry=2555906 kind=MemoryArrow -event=eval_predicate entry=2555904 selection=true cached=MemorySqueezedLiquid -event=read entry=2555904 selection=true expr=None cached=MemorySqueezedLiquid -event=io_read_squeezed_backing entry=2555904 bytes=27320 -event=eval_predicate entry=2555905 selection=true cached=MemoryLiquid -event=read entry=2555905 selection=true expr=None cached=MemoryLiquid event=eval_predicate entry=2555906 selection=true cached=MemoryArrow event=read entry=2555906 selection=true expr=None cached=MemoryArrow event=insert_success entry=4297523200 kind=MemoryArrow diff --git a/src/datafusion-local/src/tests/snapshots/liquid_cache_datafusion_local__tests__squeeze__squeeze_strings.snap b/src/datafusion-local/src/tests/snapshots/liquid_cache_datafusion_local__tests__squeeze__squeeze_strings.snap index 4451f2bd..2943782b 100644 --- a/src/datafusion-local/src/tests/snapshots/liquid_cache_datafusion_local__tests__squeeze__squeeze_strings.snap +++ b/src/datafusion-local/src/tests/snapshots/liquid_cache_datafusion_local__tests__squeeze__squeeze_strings.snap @@ -15,24 +15,34 @@ event=io_write entry=851968 kind=DiskLiquid bytes=139416 event=insert_success entry=851968 kind=DiskLiquid event=insert_failed entry=917504 kind=MemoryArrow event=insert_success entry=917504 kind=MemoryLiquid +event=eval_predicate entry=917504 selection=true cached=MemoryLiquid +event=read entry=851968 selection=true expr=None cached=DiskLiquid +event=io_read_liquid entry=851968 bytes=139416 +event=hydrate entry=851968 cached=DiskLiquid new=MemoryLiquid +event=insert_success entry=851968 kind=MemoryLiquid event=insert_success entry=851969 kind=MemoryArrow event=insert_failed entry=917505 kind=MemoryArrow -event=squeeze_begin victims=[851969,917504] +event=squeeze_begin 
victims=[851969,917504,851968] event=squeeze_victim entry=851969 event=insert_success entry=851969 kind=MemoryLiquid event=squeeze_victim entry=917504 event=io_write entry=917504 kind=MemorySqueezedLiquid bytes=136440 event=insert_success entry=917504 kind=MemorySqueezedLiquid +event=squeeze_victim entry=851968 +event=io_write entry=851968 kind=DiskLiquid bytes=139416 +event=insert_success entry=851968 kind=DiskLiquid event=insert_success entry=917505 kind=MemoryArrow +event=eval_predicate entry=917505 selection=true cached=MemoryArrow +event=read entry=851969 selection=true expr=None cached=MemoryLiquid event=insert_failed entry=851970 kind=MemoryArrow event=squeeze_begin victims=[917505,851969,917504] event=squeeze_victim entry=917505 event=insert_success entry=917505 kind=MemoryLiquid event=squeeze_victim entry=851969 event=io_write entry=851969 kind=DiskLiquid bytes=139376 +event=insert_success entry=851969 kind=DiskLiquid event=squeeze_victim entry=917504 event=insert_success entry=917504 kind=DiskLiquid -event=insert_success entry=851969 kind=DiskLiquid event=insert_success entry=851970 kind=MemoryArrow event=insert_failed entry=917506 kind=MemoryArrow event=squeeze_begin victims=[851970,917505] @@ -45,34 +55,18 @@ event=insert_failed entry=917506 kind=MemoryArrow event=squeeze_begin victims=[851970,917505] event=squeeze_victim entry=851970 event=io_write entry=851970 kind=DiskLiquid bytes=146184 +event=insert_success entry=851970 kind=DiskLiquid event=squeeze_victim entry=917505 event=insert_success entry=917505 kind=DiskLiquid -event=insert_success entry=851970 kind=DiskLiquid event=insert_success entry=917506 kind=MemoryArrow -event=eval_predicate entry=917504 selection=true cached=DiskLiquid -event=io_read_liquid entry=917504 bytes=136440 -event=hydrate entry=917504 cached=DiskLiquid new=MemoryLiquid -event=insert_failed entry=917504 kind=MemoryLiquid -event=squeeze_begin victims=[917506] -event=squeeze_victim entry=917506 -event=insert_success entry=917506 kind=MemoryLiquid -event=insert_success entry=917504 kind=MemoryLiquid -event=read entry=851968 selection=true expr=None cached=DiskLiquid -event=io_read_liquid entry=851968 bytes=139416 -event=hydrate entry=851968 cached=DiskLiquid new=MemoryLiquid -event=insert_success entry=851968 kind=MemoryLiquid -event=eval_predicate entry=917505 selection=true cached=DiskLiquid -event=io_read_liquid entry=917505 bytes=141576 -event=hydrate entry=917505 cached=DiskLiquid new=MemoryLiquid -event=insert_success entry=917505 kind=MemoryLiquid -event=read entry=851969 selection=true expr=None cached=DiskLiquid -event=io_read_liquid entry=851969 bytes=139376 -event=hydrate entry=851969 cached=DiskLiquid new=MemoryLiquid -event=insert_success entry=851969 kind=MemoryLiquid -event=eval_predicate entry=917506 selection=true cached=MemoryLiquid +event=eval_predicate entry=917506 selection=true cached=MemoryArrow event=read entry=851970 selection=true expr=None cached=DiskLiquid event=io_read_liquid entry=851970 bytes=146184 event=hydrate entry=851970 cached=DiskLiquid new=MemoryLiquid +event=insert_failed entry=851970 kind=MemoryLiquid +event=squeeze_begin victims=[917506] +event=squeeze_victim entry=917506 +event=insert_success entry=917506 kind=MemoryLiquid event=insert_success entry=851970 kind=MemoryLiquid event=insert_success entry=4295819264 kind=MemoryArrow event=insert_success entry=4295884800 kind=MemoryArrow diff --git 
a/src/datafusion-local/src/tests/snapshots/liquid_cache_datafusion_local__tests__squeeze__squeeze_substrings_search.snap b/src/datafusion-local/src/tests/snapshots/liquid_cache_datafusion_local__tests__squeeze__squeeze_substrings_search.snap index dd377dfa..14181814 100644 --- a/src/datafusion-local/src/tests/snapshots/liquid_cache_datafusion_local__tests__squeeze__squeeze_substrings_search.snap +++ b/src/datafusion-local/src/tests/snapshots/liquid_cache_datafusion_local__tests__squeeze__squeeze_substrings_search.snap @@ -4,11 +4,13 @@ expression: trace --- EventTrace: [ event=insert_success entry=2555904 kind=MemoryArrow +event=eval_predicate entry=2555904 selection=true cached=MemoryArrow event=insert_failed entry=2555905 kind=MemoryArrow event=squeeze_begin victims=[2555904] event=squeeze_victim entry=2555904 event=insert_success entry=2555904 kind=MemoryLiquid event=insert_success entry=2555905 kind=MemoryArrow +event=eval_predicate entry=2555905 selection=true cached=MemoryArrow event=insert_failed entry=2555906 kind=MemoryArrow event=squeeze_begin victims=[2555905,2555904] event=squeeze_victim entry=2555905 @@ -17,8 +19,6 @@ event=squeeze_victim entry=2555904 event=io_write entry=2555904 kind=MemorySqueezedLiquid bytes=28612 event=insert_success entry=2555904 kind=MemorySqueezedLiquid event=insert_success entry=2555906 kind=MemoryArrow -event=eval_predicate entry=2555904 selection=true cached=MemorySqueezedLiquid -event=eval_predicate entry=2555905 selection=true cached=MemoryLiquid event=eval_predicate entry=2555906 selection=true cached=MemoryArrow event=insert_success entry=4297523200 kind=MemoryArrow event=eval_predicate entry=4297523200 selection=true cached=MemoryArrow diff --git a/src/datafusion-local/src/tests/snapshots/liquid_cache_datafusion_local__tests__squeeze__squeeze_substrings_search_title.snap b/src/datafusion-local/src/tests/snapshots/liquid_cache_datafusion_local__tests__squeeze__squeeze_substrings_search_title.snap index c8bba373..505745a0 100644 --- a/src/datafusion-local/src/tests/snapshots/liquid_cache_datafusion_local__tests__squeeze__squeeze_substrings_search_title.snap +++ b/src/datafusion-local/src/tests/snapshots/liquid_cache_datafusion_local__tests__squeeze__squeeze_substrings_search_title.snap @@ -4,11 +4,13 @@ expression: trace --- EventTrace: [ event=insert_success entry=131072 kind=MemoryArrow +event=eval_predicate entry=131072 selection=true cached=MemoryArrow event=insert_failed entry=131073 kind=MemoryArrow event=squeeze_begin victims=[131072] event=squeeze_victim entry=131072 event=insert_success entry=131072 kind=MemoryLiquid event=insert_success entry=131073 kind=MemoryArrow +event=eval_predicate entry=131073 selection=true cached=MemoryArrow event=insert_failed entry=131074 kind=MemoryArrow event=squeeze_begin victims=[131073,131072] event=squeeze_victim entry=131073 @@ -17,10 +19,6 @@ event=squeeze_victim entry=131072 event=io_write entry=131072 kind=MemorySqueezedLiquid bytes=257888 event=insert_success entry=131072 kind=MemorySqueezedLiquid event=insert_success entry=131074 kind=MemoryArrow -event=eval_predicate entry=131072 selection=true cached=MemorySqueezedLiquid -event=io_read_squeezed_backing entry=131072 bytes=257888 -event=decompress_squeezed entry=131072 decompressed=1054 total=1724 -event=eval_predicate entry=131073 selection=true cached=MemoryLiquid event=eval_predicate entry=131074 selection=true cached=MemoryArrow event=insert_success entry=4295098368 kind=MemoryArrow event=eval_predicate entry=4295098368 selection=true 
cached=MemoryArrow diff --git a/src/datafusion-local/src/tests/snapshots/liquid_cache_datafusion_local__tests__url_prefix_filtering.snap b/src/datafusion-local/src/tests/snapshots/liquid_cache_datafusion_local__tests__url_prefix_filtering.snap index dc545bd8..d4817712 100644 --- a/src/datafusion-local/src/tests/snapshots/liquid_cache_datafusion_local__tests__url_prefix_filtering.snap +++ b/src/datafusion-local/src/tests/snapshots/liquid_cache_datafusion_local__tests__url_prefix_filtering.snap @@ -68,6 +68,8 @@ RuntimeStatsSnapshot: hit_date32_expression_calls: 0 read_io_count: 1 write_io_count: 0 + disk_evictions: 0 + disk_reservation_failures: 0 eval_predicate_on_liquid_failed: 0 squeezed_decompressed_count: 0 squeezed_total_count: 0 diff --git a/src/datafusion-local/src/tests/snapshots/liquid_cache_datafusion_local__tests__url_selection_and_ordering.snap b/src/datafusion-local/src/tests/snapshots/liquid_cache_datafusion_local__tests__url_selection_and_ordering.snap index f0a631fa..304f9894 100644 --- a/src/datafusion-local/src/tests/snapshots/liquid_cache_datafusion_local__tests__url_selection_and_ordering.snap +++ b/src/datafusion-local/src/tests/snapshots/liquid_cache_datafusion_local__tests__url_selection_and_ordering.snap @@ -36,24 +36,26 @@ values: stats: entries.total: 4 entries.after_first_run: 4 -entries.memory.arrow: 1 -entries.memory.liquid: 1 +entries.memory.arrow: 0 +entries.memory.liquid: 2 entries.memory.squeezed_liquid: 2 entries.disk.liquid: 0 entries.disk.arrow: 0 -usage.memory_bytes: 227389 +usage.memory_bytes: 226744 usage.disk_bytes: 564392 RuntimeStatsSnapshot: get: 3 get_with_selection: 3 eval_predicate: 4 get_squeezed_success: 0 - get_squeezed_needs_io: 4 + get_squeezed_needs_io: 2 try_read_liquid_calls: 0 hit_date32_expression_calls: 0 read_io_count: 4 - write_io_count: 0 + write_io_count: 2 + disk_evictions: 0 + disk_reservation_failures: 0 eval_predicate_on_liquid_failed: 0 - squeezed_decompressed_count: 4362 - squeezed_total_count: 8814 + squeezed_decompressed_count: 2220 + squeezed_total_count: 4486 squeeze_io_saved: 0 diff --git a/src/datafusion-local/src/tests/squeeze.rs b/src/datafusion-local/src/tests/squeeze.rs index 6a12b4a7..73b6d686 100644 --- a/src/datafusion-local/src/tests/squeeze.rs +++ b/src/datafusion-local/src/tests/squeeze.rs @@ -10,7 +10,7 @@ const TEST_FILE: &str = "../../examples/nano_hits.parquet"; async fn basic_squeeze() { let cache_dir = TempDir::new().unwrap(); let (ctx, cache) = LiquidCacheLocalBuilder::new() - .with_max_cache_bytes(1024 * 128) + .with_max_memory_bytes(1024 * 128) .with_cache_dir(cache_dir.path().to_path_buf()) .build(SessionConfig::new()) .await @@ -37,7 +37,7 @@ async fn basic_squeeze() { async fn squeeze_strings() { let cache_dir = TempDir::new().unwrap(); let (ctx, cache) = LiquidCacheLocalBuilder::new() - .with_max_cache_bytes(1024 * 1024) + .with_max_memory_bytes(1024 * 1024) .with_cache_dir(cache_dir.path().to_path_buf()) .build(SessionConfig::new()) .await @@ -64,7 +64,7 @@ async fn squeeze_strings() { async fn squeeze_substrings_search() { let cache_dir = TempDir::new().unwrap(); let (ctx, cache) = LiquidCacheLocalBuilder::new() - .with_max_cache_bytes(1024 * 256) + .with_max_memory_bytes(1024 * 256) .with_cache_dir(cache_dir.path().to_path_buf()) .build(SessionConfig::new()) .await @@ -88,7 +88,7 @@ async fn squeeze_substrings_search() { async fn squeeze_substrings_search_title() { let cache_dir = TempDir::new().unwrap(); let (ctx, cache) = LiquidCacheLocalBuilder::new() - .with_max_cache_bytes(1024 * 1024 * 
diff --git a/src/datafusion-local/src/tests/squeeze.rs b/src/datafusion-local/src/tests/squeeze.rs
index 6a12b4a7..73b6d686 100644
--- a/src/datafusion-local/src/tests/squeeze.rs
+++ b/src/datafusion-local/src/tests/squeeze.rs
@@ -10,7 +10,7 @@ const TEST_FILE: &str = "../../examples/nano_hits.parquet";
 async fn basic_squeeze() {
     let cache_dir = TempDir::new().unwrap();
     let (ctx, cache) = LiquidCacheLocalBuilder::new()
-        .with_max_cache_bytes(1024 * 128)
+        .with_max_memory_bytes(1024 * 128)
         .with_cache_dir(cache_dir.path().to_path_buf())
         .build(SessionConfig::new())
         .await
@@ -37,7 +37,7 @@ async fn basic_squeeze() {
 async fn squeeze_strings() {
     let cache_dir = TempDir::new().unwrap();
     let (ctx, cache) = LiquidCacheLocalBuilder::new()
-        .with_max_cache_bytes(1024 * 1024)
+        .with_max_memory_bytes(1024 * 1024)
         .with_cache_dir(cache_dir.path().to_path_buf())
         .build(SessionConfig::new())
         .await
@@ -64,7 +64,7 @@ async fn squeeze_strings() {
 async fn squeeze_substrings_search() {
     let cache_dir = TempDir::new().unwrap();
     let (ctx, cache) = LiquidCacheLocalBuilder::new()
-        .with_max_cache_bytes(1024 * 256)
+        .with_max_memory_bytes(1024 * 256)
         .with_cache_dir(cache_dir.path().to_path_buf())
         .build(SessionConfig::new())
         .await
@@ -88,7 +88,7 @@ async fn squeeze_substrings_search() {
 async fn squeeze_substrings_search_title() {
     let cache_dir = TempDir::new().unwrap();
     let (ctx, cache) = LiquidCacheLocalBuilder::new()
-        .with_max_cache_bytes(1024 * 1024 * 4)
+        .with_max_memory_bytes(1024 * 1024 * 4)
         .with_cache_dir(cache_dir.path().to_path_buf())
         .build(SessionConfig::new())
         .await
@@ -113,7 +113,7 @@ async fn squeeze_substrings_search_title() {
 async fn squeeze_distinct_search_phase() {
     let cache_dir = TempDir::new().unwrap();
     let (ctx, cache) = LiquidCacheLocalBuilder::new()
-        .with_max_cache_bytes(1024 * 256)
+        .with_max_memory_bytes(1024 * 256)
         .with_cache_dir(cache_dir.path().to_path_buf())
         .build(SessionConfig::new())
         .await
diff --git a/src/datafusion-local/src/tests/variants.rs b/src/datafusion-local/src/tests/variants.rs
index e74b0c03..aa550ab5 100644
--- a/src/datafusion-local/src/tests/variants.rs
+++ b/src/datafusion-local/src/tests/variants.rs
@@ -170,7 +170,7 @@ async fn test_variant_transcoding_falls_back_to_disk_arrow() {
 
     let (ctx, cache) = LiquidCacheLocalBuilder::new()
         .with_batch_size(1)
-        .with_max_cache_bytes(64)
+        .with_max_memory_bytes(64)
         .with_cache_dir(cache_dir.path().to_path_buf())
         .with_squeeze_policy(Box::new(TranscodeSqueezeEvict))
         .build(SessionConfig::new())
@@ -376,7 +376,7 @@ async fn test_large_variant_squeeze() {
 
     let (ctx, _cache) = LiquidCacheLocalBuilder::new()
         .with_cache_dir(cache_dir.path().to_path_buf())
-        .with_max_cache_bytes(1024)
+        .with_max_memory_bytes(1024)
         .with_squeeze_policy(Box::new(TranscodeSqueezeEvict))
         .build(SessionConfig::new())
         .await
@@ -416,7 +416,7 @@ async fn variant_multi_queries() {
 
     let (ctx, _cache) = LiquidCacheLocalBuilder::new()
         .with_cache_dir(cache_dir.path().to_path_buf())
-        .with_max_cache_bytes(1024)
+        .with_max_memory_bytes(1024)
         .with_squeeze_policy(Box::new(TranscodeSqueezeEvict))
         .build(SessionConfig::new())
         .await
@@ -465,7 +465,7 @@ async fn variant_multi_queries_complex() {
 
     let (ctx, cache) = LiquidCacheLocalBuilder::new()
         .with_cache_dir(cache_dir.path().to_path_buf())
-        .with_max_cache_bytes(1024 * 600)
+        .with_max_memory_bytes(1024 * 600)
         .with_batch_size(8)
         .with_squeeze_policy(Box::new(TranscodeSqueezeEvict))
         .build(SessionConfig::new())
diff --git a/src/datafusion-server/Cargo.toml b/src/datafusion-server/Cargo.toml
index a0101d22..556f21dd 100644
--- a/src/datafusion-server/Cargo.toml
+++ b/src/datafusion-server/Cargo.toml
@@ -23,7 +23,7 @@ liquid-cache-datafusion = { workspace = true }
 object_store = { workspace = true, features = ["aws", "http"] }
 liquid-cache-common = { workspace = true }
 tempfile = { workspace = true }
-axum = "0.8.8"
+axum = "0.8.9"
 serde = { workspace = true }
 tower-http = { version = "0.6.8", features = ["cors"] }
 sysinfo = { version = "0.38.4", default-features = false, features = [
diff --git a/src/datafusion-server/src/admin_server/handlers.rs b/src/datafusion-server/src/admin_server/handlers.rs
index 87e4e12e..55efd0a5 100644
--- a/src/datafusion-server/src/admin_server/handlers.rs
+++ b/src/datafusion-server/src/admin_server/handlers.rs
@@ -121,7 +121,7 @@ pub(crate) async fn get_parquet_cache_usage_handler(
 #[derive(Serialize)]
 pub(crate) struct CacheInfo {
     batch_size: usize,
-    max_cache_bytes: u64,
+    max_memory_bytes: u64,
     memory_usage_bytes: u64,
     disk_usage_bytes: u64,
 }
@@ -130,12 +130,12 @@ pub(crate) async fn get_cache_info_handler(State(state): State<Arc<AppState>>) ->
     info!("Getting cache info...");
     let cache = state.liquid_cache.cache();
     let batch_size = cache.batch_size();
-    let max_cache_bytes = cache.max_cache_bytes() as u64;
+    let max_memory_bytes = cache.max_memory_bytes() as u64;
     let memory_usage_bytes = cache.memory_usage_bytes() as u64;
     let disk_usage_bytes = cache.disk_usage_bytes() as u64;
     Json(CacheInfo {
         batch_size,
-        max_cache_bytes,
+        max_memory_bytes,
         memory_usage_bytes,
         disk_usage_bytes,
     })
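For consumers of the admin endpoint, the struct rename above changes the serialized payload: max_cache_bytes becomes max_memory_bytes. A hypothetical response body (field names follow the Serialize derive; the values are illustrative):

    {
      "batch_size": 8192,
      "max_memory_bytes": 1073741824,
      "memory_usage_bytes": 1048576,
      "disk_usage_bytes": 0
    }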
diff --git a/src/datafusion-server/src/lib.rs b/src/datafusion-server/src/lib.rs
index abba734b..3a5ee2a1 100644
--- a/src/datafusion-server/src/lib.rs
+++ b/src/datafusion-server/src/lib.rs
@@ -124,11 +124,11 @@ impl LiquidCacheService {
     /// # Arguments
     ///
     /// * `ctx` - The [SessionContext] to use
-    /// * `max_cache_bytes` - The maximum number of bytes to cache in memory
+    /// * `max_memory_bytes` - The maximum number of bytes to cache in memory
     /// * `disk_cache_dir` - The directory to store the disk cache
     pub async fn new(
         ctx: SessionContext,
-        max_cache_bytes: Option<usize>,
+        max_memory_bytes: Option<usize>,
         disk_cache_dir: Option<PathBuf>,
         cache_policy: Box<dyn CachePolicy>,
         squeeze_policy: Box<dyn SqueezePolicy>,
@@ -145,7 +145,7 @@ impl LiquidCacheService {
         Ok(Self {
             inner: LiquidCacheServiceInner::new(
                 Arc::new(ctx),
-                max_cache_bytes,
+                max_memory_bytes,
                 disk_cache_dir,
                 cache_policy,
                 squeeze_policy,
diff --git a/src/datafusion-server/src/service.rs b/src/datafusion-server/src/service.rs
index 1d9cbe46..1ccfe76d 100644
--- a/src/datafusion-server/src/service.rs
+++ b/src/datafusion-server/src/service.rs
@@ -46,7 +46,7 @@ pub(crate) struct LiquidCacheServiceInner {
 impl LiquidCacheServiceInner {
     pub async fn new(
         default_ctx: Arc<SessionContext>,
-        max_cache_bytes: Option<usize>,
+        max_memory_bytes: Option<usize>,
         disk_cache_dir: PathBuf,
         cache_policy: Box<dyn CachePolicy>,
         squeeze_policy: Box<dyn SqueezePolicy>,
@@ -64,7 +64,8 @@ impl LiquidCacheServiceInner {
         let liquid_cache = Arc::new(
             LiquidCacheParquet::new(
                 batch_size,
-                max_cache_bytes.unwrap_or(usize::MAX),
+                max_memory_bytes.unwrap_or(usize::MAX),
+                usize::MAX,
                 store,
                 cache_policy,
                 squeeze_policy,
@@ -150,7 +151,7 @@ impl LiquidCacheServiceInner {
         let cache = self.cache();
         self.execution_plans.write().unwrap().insert(
             handle,
-            ExecutionPlanEntry::new(rewrite_data_source_plan(plan, cache, true)),
+            ExecutionPlanEntry::new(rewrite_data_source_plan(plan, cache)),
         );
     }
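Server-side, the same rename flows through LiquidCacheService::new, where the memory budget stays optional and None falls back to usize::MAX in LiquidCacheServiceInner::new. A rough sketch of a caller, assuming the argument list is exactly what the hunks above show (the signature may carry further parameters outside the hunk context):

    let service = LiquidCacheService::new(
        SessionContext::new(),
        Some(256 * 1024 * 1024),    // max_memory_bytes; None means unbounded
        Some(std::env::temp_dir()), // disk_cache_dir (illustrative)
        Box::new(LiquidPolicy::new()),
        Box::new(TranscodeSqueezeEvict),
    )
    .await?;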
diff --git a/src/datafusion/Cargo.toml b/src/datafusion/Cargo.toml
index 9a927edb..0dfd8d59 100644
--- a/src/datafusion/Cargo.toml
+++ b/src/datafusion/Cargo.toml
@@ -21,10 +21,7 @@ object_store = { workspace = true, features = ["http"] }
 liquid-cache-common = { workspace = true }
 liquid-cache = { workspace = true }
 fastrace = { workspace = true }
-async-trait = { workspace = true }
 parquet-variant-json = { workspace = true }
-parquet-variant-compute = { workspace = true }
-serde = { workspace = true }
 serde_json = { workspace = true }
 t4 = { workspace = true }
 
@@ -32,8 +29,8 @@ t4 = { workspace = true }
 [dev-dependencies]
 tempfile = "3.27.0"
 divan = "0.1"
-rand = "0.10.0"
-shuttle = "0.8.1"
+rand = "0.10.1"
+shuttle = "0.9.1"
 tokio-test = "0.4"
 serde_json = { workspace = true }
diff --git a/src/datafusion/bench/filter_pushdown.rs b/src/datafusion/bench/filter_pushdown.rs
index 7488865a..897d8e2e 100644
--- a/src/datafusion/bench/filter_pushdown.rs
+++ b/src/datafusion/bench/filter_pushdown.rs
@@ -43,7 +43,8 @@ fn setup_cache() -> (Arc<LiquidCacheParquet>, tempfile::TempDir) {
     let store = tokio_test::block_on(t4::mount(&store_path)).expect("failed to mount t4 store");
     let cache = tokio_test::block_on(LiquidCacheParquet::new(
         BATCH_SIZE,
-        1024 * 1024 * 1024, // max_cache_bytes (1GB)
+        1024 * 1024 * 1024, // max_memory_bytes (1GB)
+        usize::MAX,
         store,
         Box::new(LiquidPolicy::new()),
         Box::new(TranscodeSqueezeEvict),
diff --git a/src/datafusion/src/cache/column.rs b/src/datafusion/src/cache/column.rs
index a4a1f71d..65870a78 100644
--- a/src/datafusion/src/cache/column.rs
+++ b/src/datafusion/src/cache/column.rs
@@ -1,22 +1,17 @@
 use arrow::{
-    array::{Array, ArrayRef, AsArray, BooleanArray},
+    array::{Array, ArrayRef, BooleanArray},
     buffer::BooleanBuffer,
     compute::prep_null_mask_filter,
     record_batch::RecordBatch,
 };
 use arrow_schema::{ArrowError, DataType, Field, Schema};
-use liquid_cache::cache::{CacheExpression, LiquidCache, LiquidExpr};
-use liquid_cache::utils::VariantSchema;
-use liquid_cache::utils::typed_struct_contains_path;
+use liquid_cache::cache::{CacheExpression, CacheFull, LiquidCache, LiquidExpr};
 use parquet::arrow::arrow_reader::ArrowPredicate;
-use parquet_variant_compute::{VariantArray, VariantType, shred_variant, unshred_variant};
 
 use crate::{
     LiquidPredicate,
     cache::{BatchID, ColumnAccessPath, ParquetArrayID},
-    optimizers::{
-        DATE_MAPPING_METADATA_KEY, STRING_FINGERPRINT_METADATA_KEY, variant_mappings_from_field,
-    },
+    optimizers::{DATE_MAPPING_METADATA_KEY, STRING_FINGERPRINT_METADATA_KEY},
 };
 use std::sync::Arc;
@@ -49,17 +44,6 @@ fn infer_expression(field: &Field) -> Option<CacheExpression> {
     {
         return Some(CacheExpression::substring_search());
     }
-    if field.try_extension_type::<VariantType>().is_ok()
-        && let Some(mappings) = variant_mappings_from_field(field)
-    {
-        let typed_specs: Vec<_> = mappings
-            .into_iter()
-            .filter_map(|mapping| mapping.data_type.map(|data_type| (mapping.path, data_type)))
-            .collect();
-        if !typed_specs.is_empty() {
-            return Some(CacheExpression::variant_get_many(typed_specs));
-        }
-    }
     None
 }
@@ -68,6 +52,14 @@ fn infer_expression(field: &Field) -> Option<CacheExpression> {
 pub enum InsertArrowArrayError {
     /// The array is already cached.
     AlreadyCached,
+    /// The cache does not have enough disk budget to accept the array.
+    CacheFull,
+}
+
+impl From<CacheFull> for InsertArrowArrayError {
+    fn from(_: CacheFull) -> Self {
+        Self::CacheFull
+    }
 }
 
 impl CachedColumn {
@@ -180,17 +172,12 @@ impl CachedColumn {
         filter: &BooleanBuffer,
     ) -> Option<ArrayRef> {
         let entry_id = self.entry_id(batch_id).into();
-        let mut array = self
-            .cache_store
+        self.cache_store
             .get(&entry_id)
             .with_selection(filter)
             .with_optional_expression_hint(self.expression())
             .read()
-            .await?;
-        if let Some(transformed) = maybe_shred_variant_array(&array, self.field.as_ref()) {
-            array = transformed;
-        }
-        Some(array)
+            .await
     }
 
     #[cfg(test)]
@@ -209,75 +196,13 @@ impl CachedColumn {
             return Err(InsertArrowArrayError::AlreadyCached);
         }
 
-        let mut array = array;
-        if let Some(transformed) = maybe_shred_variant_array(&array, self.field.as_ref()) {
-            array = transformed;
-        }
         self.cache_store
             .insert(self.entry_id(batch_id).into(), array)
-            .await;
+            .await?;
         Ok(())
     }
 }
 
-fn maybe_shred_variant_array(array: &ArrayRef, field: &Field) -> Option<ArrayRef> {
-    let mappings = variant_mappings_from_field(field)?;
-    let typed_specs: Vec<(String, DataType)> = mappings
-        .into_iter()
-        .filter_map(|mapping| mapping.data_type.map(|data_type| (mapping.path, data_type)))
-        .collect();
-    if typed_specs.is_empty() {
-        return None;
-    }
-    shred_variant_array(array, field, &typed_specs)
-}
-
-fn shred_variant_array(
-    array: &ArrayRef,
-    field: &Field,
-    specs: &[(String, DataType)],
-) -> Option<ArrayRef> {
-    if specs.is_empty() {
-        return None;
-    }
-
-    let variant_array = VariantArray::try_new(array.as_ref()).ok()?;
-    let missing_specs: Vec<_> = specs
-        .iter()
-        .filter(|(path, _)| !variant_contains_typed_field(&variant_array, path))
-        .collect();
-    if missing_specs.is_empty() {
-        return None;
-    }
-
-    let target_fields = match field.data_type() {
-        DataType::Struct(fields) => fields.clone(),
-        _ => return None,
-    };
-    let typed_schema = target_fields
-        .iter()
-        .find(|child| child.name() == "typed_value")
-        .cloned()?;
-    let mut schema = VariantSchema::new(Some(typed_schema.as_ref()));
-    for (path, data_type) in missing_specs {
-        schema.insert_path(path, data_type);
-    }
-    let shredding_schema = schema.shredding_type()?;
-    let unshredded = unshred_variant(&variant_array).ok()?;
-    let shredded = shred_variant(&unshredded, &shredding_schema).ok()?;
-    Some(Arc::new(shredded.into_inner()))
-}
-
-fn variant_contains_typed_field(array: &VariantArray, path: &str) -> bool {
-    let Some(typed_field) = array.typed_value_field() else {
-        return false;
-    };
-    let Some(typed_root) = typed_field.as_struct_opt() else {
-        return false;
-    };
-    typed_struct_contains_path(typed_root, path)
-}
-
 fn is_string_type(data_type: &DataType) -> bool {
     match data_type {
         DataType::Utf8 | DataType::Utf8View | DataType::LargeUtf8 => true,
@@ -285,88 +210,3 @@ fn is_string_type(data_type: &DataType) -> bool {
         _ => false,
     }
 }
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-    use crate::optimizers::{
-        VARIANT_MAPPING_METADATA_KEY, VariantField, enrich_variant_field_type,
-    };
-    use arrow::array::{ArrayRef, StringArray, StructArray};
-    use parquet::variant::{VariantType, json_to_variant};
-    use serde_json::json;
-    use std::collections::HashMap;
-
-    #[test]
-    fn shredding_adds_all_variant_paths() {
-        let values = StringArray::from(vec![
-            Some(r#"{"name":"Alice","age":30}"#),
-            Some(r#"{"name":"Bob","age":27}"#),
-        ]);
-        let variant = json_to_variant(&(Arc::new(values) as ArrayRef)).expect("variant");
-
-        let mut metadata = HashMap::new();
-        metadata.insert(
-            VARIANT_MAPPING_METADATA_KEY.to_string(),
-            serde_json::to_string(&vec![
-                json!({"path": "name", "type": "Utf8"}),
-                json!({"path": "age", "type": "Int64"}),
-            ])
-            .unwrap(),
-        );
-
-        let variant_fields = vec![
-            VariantField {
-                path: "name".to_string(),
-                data_type: Some(DataType::Utf8),
-            },
-            VariantField {
-                path: "age".to_string(),
-                data_type: Some(DataType::Int64),
-            },
-        ];
-
-        let base_field = Field::new("variant", variant.inner().data_type().clone(), true)
-            .with_extension_type(VariantType)
-            .with_metadata(metadata);
-        let enriched = enrich_variant_field_type(base_field.as_ref(), &variant_fields)
-            .with_metadata(base_field.metadata().clone());
-        let array: ArrayRef = ArrayRef::from(variant);
-
-        let shredded = maybe_shred_variant_array(&array, enriched.as_ref())
-            .expect("variant should be shredded");
-        let shredded_struct = shredded
-            .as_any()
-            .downcast_ref::<StructArray>()
-            .expect("struct array");
-        let typed_value = shredded_struct
-            .column_by_name("typed_value")
-            .expect("typed_value column");
-        let typed_struct = typed_value
-            .as_any()
-            .downcast_ref::<StructArray>()
-            .expect("typed struct");
-
-        let name_struct = typed_struct
-            .column_by_name("name")
-            .expect("name path")
-            .as_any()
-            .downcast_ref::<StructArray>()
-            .expect("name struct");
-        let name_values = name_struct
-            .column_by_name("typed_value")
-            .expect("name typed value");
-        assert_eq!(name_values.data_type(), &DataType::Utf8);
-
-        let age_struct = typed_struct
-            .column_by_name("age")
-            .expect("age path")
-            .as_any()
-            .downcast_ref::<StructArray>()
-            .expect("age struct");
-        let age_values = age_struct
-            .column_by_name("typed_value")
-            .expect("age typed value");
-        assert_eq!(age_values.data_type(), &DataType::Int64);
-    }
-}
diff --git a/src/datafusion/src/cache/mod.rs b/src/datafusion/src/cache/mod.rs
index b9ad5658..d8af7a41 100644
--- a/src/datafusion/src/cache/mod.rs
+++ b/src/datafusion/src/cache/mod.rs
@@ -1,7 +1,7 @@
 //! This module contains the cache implementation for the Parquet reader.
 //!
-use crate::io::ParquetIoContext;
+use crate::io::ParquetCacheMetadata;
 use crate::reader::{LiquidPredicate, extract_multi_column_or};
 use crate::sync::Mutex;
 use ahash::AHashMap;
@@ -243,21 +243,51 @@ impl LiquidCacheParquet {
     /// Create a new cache for parquet files.
     pub async fn new(
         batch_size: usize,
-        max_cache_bytes: usize,
+        max_memory_bytes: usize,
+        max_disk_bytes: usize,
         store: t4::Store,
         cache_policy: Box<dyn CachePolicy>,
         squeeze_policy: Box<dyn SqueezePolicy>,
         hydration_policy: Box<dyn HydrationPolicy>,
+    ) -> Self {
+        Self::new_with_squeeze_victim_concurrency(
+            batch_size,
+            max_memory_bytes,
+            max_disk_bytes,
+            store,
+            cache_policy,
+            squeeze_policy,
+            hydration_policy,
+            !cfg!(test),
+        )
+        .await
+    }
+
+    /// Create a new cache for parquet files with explicit victim squeeze concurrency.
+    #[doc(hidden)]
+    #[allow(clippy::too_many_arguments)]
+    pub async fn new_with_squeeze_victim_concurrency(
+        batch_size: usize,
+        max_memory_bytes: usize,
+        max_disk_bytes: usize,
+        store: t4::Store,
+        cache_policy: Box<dyn CachePolicy>,
+        squeeze_policy: Box<dyn SqueezePolicy>,
+        hydration_policy: Box<dyn HydrationPolicy>,
+        squeeze_victims_concurrently: bool,
     ) -> Self {
         assert!(batch_size.is_power_of_two());
-        let io_context = Arc::new(ParquetIoContext::new(store));
+        let metadata = Arc::new(ParquetCacheMetadata::new());
         let cache_storage = LiquidCacheBuilder::new()
             .with_batch_size(batch_size)
-            .with_max_cache_bytes(max_cache_bytes)
+            .with_max_memory_bytes(max_memory_bytes)
+            .with_max_disk_bytes(max_disk_bytes)
             .with_squeeze_policy(squeeze_policy)
             .with_cache_policy(cache_policy)
             .with_hydration_policy(hydration_policy)
-            .with_io_context(io_context)
+            .with_metadata(metadata)
+            .with_store(store)
+            .with_squeeze_victims_concurrently(squeeze_victims_concurrently)
             .build()
             .await;
 
@@ -292,9 +322,14 @@ impl LiquidCacheParquet {
         self.cache_store.config().batch_size()
     }
 
-    /// Get the max cache bytes of the cache.
-    pub fn max_cache_bytes(&self) -> usize {
-        self.cache_store.config().max_cache_bytes()
+    /// Get the max memory bytes of the cache.
+    pub fn max_memory_bytes(&self) -> usize {
+        self.cache_store.config().max_memory_bytes()
+    }
+
+    /// Get the max disk bytes of the cache.
+    pub fn max_disk_bytes(&self) -> usize {
+        self.cache_store.config().max_disk_bytes()
     }
 
     /// Get the memory usage of the cache in bytes.
@@ -340,8 +375,8 @@ impl LiquidCacheParquet {
     /// This is for admin use only.
     /// This has no guarantees that some new entry will not be inserted in the meantime, or some entries are promoted to memory again.
     /// You mostly want to use this when no one else is using the cache.
-    pub async fn flush_data(&self) {
-        self.cache_store.flush_all_to_disk().await;
+    pub async fn flush_data(&self) -> Result<(), liquid_cache::cache::CacheFull> {
+        self.cache_store.flush_all_to_disk().await
     }
 
     /// Get the storage of the cache.
@@ -384,6 +419,7 @@ mod tests {
         let cache = LiquidCacheParquet::new(
             batch_size,
             usize::MAX,
+            usize::MAX,
             store,
             Box::new(LiquidPolicy::new()),
             Box::new(TranscodeSqueezeEvict),
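Direct constructors of the cache gain one argument: a disk budget that sits alongside the memory budget. A minimal sketch based on the constructor above and the bench setup (the concrete values are illustrative):

    let store = t4::mount(cache_dir.join("liquid_cache.t4")).await?;
    let cache = LiquidCacheParquet::new(
        8192,               // batch_size: must be a power of two
        1024 * 1024 * 1024, // max_memory_bytes
        usize::MAX,         // max_disk_bytes: effectively unbounded
        store,
        Box::new(LiquidPolicy::new()),
        Box::new(TranscodeSqueezeEvict),
        Box::new(AlwaysHydrate::new()),
    )
    .await;

Note that flush_data now returns Result<(), CacheFull>: admin callers that force-spill to disk must handle an exhausted disk budget instead of assuming the flush always succeeds.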
diff --git a/src/datafusion/src/cache/stats.rs b/src/datafusion/src/cache/stats.rs
index ef5603fe..9527f08d 100644
--- a/src/datafusion/src/cache/stats.rs
+++ b/src/datafusion/src/cache/stats.rs
@@ -129,15 +129,15 @@ impl LiquidCacheParquet {
             CacheEntry::MemoryArrow(array) => Some(array.len() as u64),
             CacheEntry::MemoryLiquid(array) => Some(array.len() as u64),
             CacheEntry::MemorySqueezedLiquid(array) => Some(array.len() as u64),
-            CacheEntry::DiskLiquid(_) => None,
-            CacheEntry::DiskArrow(_) => None, // We'd need to read it to get the count
+            CacheEntry::DiskLiquid { .. } => None,
+            CacheEntry::DiskArrow { .. } => None, // We'd need to read it to get the count
         };
         let cache_type = match cached_batch {
             CacheEntry::MemoryArrow(_) => "InMemory",
             CacheEntry::MemoryLiquid(_) => "LiquidMemory",
             CacheEntry::MemorySqueezedLiquid(_) => "LiquidSqueezed",
-            CacheEntry::DiskLiquid(_) => "OnDiskLiquid",
-            CacheEntry::DiskArrow(_) => "OnDiskArrow",
+            CacheEntry::DiskLiquid { .. } => "OnDiskLiquid",
+            CacheEntry::DiskArrow { .. } => "OnDiskArrow",
         };
         let reference_count = cached_batch.reference_count();
         let entry_id = ParquetArrayID::from(*entry_id);
@@ -188,6 +188,7 @@ mod tests {
         let cache = LiquidCacheParquet::new(
             1024,
             usize::MAX,
+            usize::MAX,
             store,
             Box::new(LiquidPolicy::new()),
             Box::new(Evict),
diff --git a/src/datafusion/src/io/mod.rs b/src/datafusion/src/io/mod.rs
index 667a84c0..0982379f 100644
--- a/src/datafusion/src/io/mod.rs
+++ b/src/datafusion/src/io/mod.rs
@@ -1,34 +1,22 @@
 use std::{
     collections::VecDeque,
-    ops::Range,
     sync::{Arc, RwLock},
 };
 
 use ahash::AHashMap;
-use bytes::Bytes;
-use liquid_cache::cache::{CacheExpression, EntryID, IoContext, LiquidCompressorStates};
+use liquid_cache::cache::{CacheExpression, EntryID, EntryMetadata, LiquidCompressorStates};
 
 use crate::cache::{ColumnAccessPath, ParquetArrayID};
 
-/// Convert an [`EntryID`] to a t4 key (8-byte little-endian representation).
-fn entry_id_to_key(entry_id: &EntryID) -> Vec<u8> {
-    usize::from(*entry_id).to_le_bytes().to_vec()
-}
-
-#[derive(Debug)]
-pub(crate) struct ParquetIoContext {
+#[derive(Debug, Default)]
+pub(crate) struct ParquetCacheMetadata {
     compressor_states: RwLock<AHashMap<ColumnAccessPath, Arc<LiquidCompressorStates>>>,
     expression_hints: RwLock<AHashMap<ColumnAccessPath, ColumnExpressionTracker>>,
-    store: t4::Store,
 }
 
-impl ParquetIoContext {
-    pub fn new(store: t4::Store) -> Self {
-        Self {
-            compressor_states: RwLock::new(AHashMap::new()),
-            expression_hints: RwLock::new(AHashMap::new()),
-            store,
-        }
+impl ParquetCacheMetadata {
+    pub fn new() -> Self {
+        Self::default()
     }
 }
@@ -67,8 +55,7 @@ impl ColumnExpressionTracker {
     }
 }
 
-#[async_trait::async_trait]
-impl IoContext for ParquetIoContext {
+impl EntryMetadata for ParquetCacheMetadata {
     fn add_squeeze_hint(&self, entry_id: &EntryID, expression: Arc<CacheExpression>) {
         let column_path = ColumnAccessPath::from(ParquetArrayID::from(*entry_id));
         let mut guard = self.expression_hints.write().unwrap();
@@ -92,46 +79,6 @@ impl IoContext for ParquetIoContext {
             .or_insert_with(|| Arc::new(LiquidCompressorStates::new()))
             .clone()
     }
-
-    #[inline(never)]
-    #[fastrace::trace]
-    async fn read(
-        &self,
-        entry_id: &EntryID,
-        range: Option<Range<usize>>,
-    ) -> Result<Bytes, std::io::Error> {
-        let key = entry_id_to_key(entry_id);
-        match range {
-            Some(range) => {
-                let len = range.end - range.start;
-                let bytes = self
-                    .store
-                    .get_range(&key, range.start, len)
-                    .await
-                    .map_err(|e| std::io::Error::other(e.to_string()))?;
-                Ok(Bytes::from(bytes))
-            }
-            None => {
-                let bytes = self
-                    .store
-                    .get(&key)
-                    .await
-                    .map_err(|e| std::io::Error::other(e.to_string()))?;
-                Ok(Bytes::from(bytes))
-            }
-        }
-    }
-
-    #[inline(never)]
-    #[fastrace::trace]
-    async fn write(&self, entry_id: &EntryID, data: Bytes) -> Result<(), std::io::Error> {
-        let key = entry_id_to_key(entry_id);
-        self.store
-            .put(key, data.to_vec())
-            .await
-            .map_err(|e| std::io::Error::other(e.to_string()))?;
-        Ok(())
-    }
 }
 
 #[cfg(test)]
@@ -144,38 +91,36 @@ mod tests {
         EntryID::from(usize::from(id))
     }
 
-    fn make_ctx() -> ParquetIoContext {
-        let tmp = tempfile::tempdir().unwrap();
-        let store = tokio_test::block_on(t4::mount(tmp.path().join("liquid_cache.t4"))).unwrap();
-        ParquetIoContext::new(store)
+    fn make_meta() -> ParquetCacheMetadata {
+        ParquetCacheMetadata::new()
     }
 
     #[test]
     fn squeeze_hint_tracks_majority() {
-        let ctx = make_ctx();
+        let meta = make_meta();
        let e = entry(1, 2, 3);
         let month = Arc::new(CacheExpression::extract_date32(Date32Field::Month));
         let year = Arc::new(CacheExpression::extract_date32(Date32Field::Year));
 
-        ctx.add_squeeze_hint(&e, month.clone());
-        ctx.add_squeeze_hint(&e, month.clone());
-        ctx.add_squeeze_hint(&e, year.clone());
+        meta.add_squeeze_hint(&e, month.clone());
+        meta.add_squeeze_hint(&e, month.clone());
+        meta.add_squeeze_hint(&e, year.clone());
 
-        let majority = ctx.squeeze_hint(&e).expect("hint");
+        let majority = meta.squeeze_hint(&e).expect("hint");
         assert_eq!(majority, month);
     }
 
     #[test]
     fn squeeze_hint_prefers_recent_on_tie() {
-        let ctx = make_ctx();
+        let meta = make_meta();
         let e = entry(9, 9, 9);
         let year = Arc::new(CacheExpression::extract_date32(Date32Field::Year));
         let day = Arc::new(CacheExpression::extract_date32(Date32Field::Day));
 
-        ctx.add_squeeze_hint(&e, year.clone());
-        ctx.add_squeeze_hint(&e, day.clone());
+        meta.add_squeeze_hint(&e, year.clone());
+        meta.add_squeeze_hint(&e, day.clone());
 
-        let majority = ctx.squeeze_hint(&e).expect("hint");
+        let majority = meta.squeeze_hint(&e).expect("hint");
         assert_eq!(majority, day);
     }
 }
diff --git a/src/datafusion/src/lib.rs b/src/datafusion/src/lib.rs
index 8db55316..2c631f69 100644
--- a/src/datafusion/src/lib.rs
+++ b/src/datafusion/src/lib.rs
@@ -6,7 +6,6 @@ pub mod optimizers;
 mod reader;
 mod sync;
 pub(crate) mod utils;
-pub use liquid_cache::utils::VariantSchema;
 
 pub mod cache;
 pub use cache::{LiquidCacheParquet, LiquidCacheParquetRef};
diff --git a/src/datafusion/src/optimizers/lineage_opt.rs b/src/datafusion/src/optimizers/lineage_opt.rs
index 58ff79c0..e8a3190b 100644
--- a/src/datafusion/src/optimizers/lineage_opt.rs
+++ b/src/datafusion/src/optimizers/lineage_opt.rs
@@ -403,9 +403,7 @@ impl LineageAnalyzer {
         let input_map = self.analyze_plan(alias.input.as_ref())?;
         let input_columns = alias.input.schema().columns();
         let mut output = LineageMap::new();
-        for (input_column, output_column) in
-            input_columns.iter().zip(alias.schema.columns().into_iter())
-        {
+        for (input_column, output_column) in input_columns.iter().zip(alias.schema.columns()) {
             let key = ColumnKey::from_column(&output_column);
             let usages = input_map
                 .get(&ColumnKey::from_column(input_column))
@@ -526,7 +524,7 @@ impl LineageAnalyzer {
         for (expr, column) in distinct_on
             .select_expr
             .iter()
-            .zip(distinct_on.schema.columns().into_iter())
+            .zip(distinct_on.schema.columns())
         {
             let usages = lineage_for_expr(expr, &input_map, schema.as_ref())?;
             output.insert(ColumnKey::from_column(&column), usages);
@@ -1068,9 +1066,7 @@ fn part_to_unit(expr: &Expr) -> Option<DatePartUnit> {
 
 #[cfg(test)]
 mod tests {
-    use crate::optimizers::{
-        DATE_MAPPING_METADATA_KEY, LocalModeOptimizer, VARIANT_MAPPING_METADATA_KEY,
-    };
+    use crate::optimizers::{DATE_MAPPING_METADATA_KEY, LocalModeOptimizer};
     use crate::{LiquidCacheParquet, VariantGetUdf, VariantToJsonUdf};
     use liquid_cache::cache::AlwaysHydrate;
@@ -1087,7 +1083,6 @@ mod tests {
     use liquid_cache::cache_policies::LiquidPolicy;
     use parquet::arrow::ArrowWriter;
     use parquet::variant::{VariantArray, json_to_variant};
-    use serde::Deserialize;
     use tempfile::TempDir;
 
     // ─────────────────────────────────────────────────────────────────────────────
@@ -1103,6 +1098,7 @@ mod tests {
         LiquidCacheParquet::new(
             1024,
             1024 * 1024 * 1024,
+            usize::MAX,
             store,
             Box::new(LiquidPolicy::new()),
             Box::new(TranscodeSqueezeEvict),
@@ -1312,35 +1308,12 @@ mod tests {
         field_metadata_map
     }
 
-    #[derive(Debug, Deserialize)]
-    struct VariantMetadataEntry {
-        path: String,
-        #[serde(rename = "type")]
-        data_type: Option<String>,
-    }
-
-    fn parse_variant_metadata(value: &str) -> Vec<VariantMetadataEntry> {
-        serde_json::from_str(value).unwrap_or_else(|_| {
-            vec![VariantMetadataEntry {
-                path: value.to_string(),
-                data_type: None,
-            }]
-        })
-    }
-
-    fn variant_paths_from_metadata(value: &str) -> Vec<String> {
-        parse_variant_metadata(value)
-            .into_iter()
-            .map(|entry| entry.path)
-            .collect()
-    }
-
-    /// Assert metadata on physical plan matches expected date and variant extractions
+    /// Assert metadata on physical plan matches expected date extractions.
     async fn assert_metadata(
         ctx: &SessionContext,
         sql: &str,
         expected_date: Vec<(&str, &str)>,
-        expected_variant: Vec<&str>,
+        _expected_variant: Vec<&str>,
     ) {
         let df = ctx.sql(sql).await.unwrap();
         let (state, plan) = df.into_parts();
@@ -1348,9 +1321,7 @@ mod tests {
         let physical_plan = state.create_physical_plan(&optimized).await.unwrap();
 
         let date_metadata = extract_field_metadata(&physical_plan, DATE_MAPPING_METADATA_KEY);
-        let variant_metadata = extract_field_metadata(&physical_plan, VARIANT_MAPPING_METADATA_KEY);
 
-        // Check date metadata
         let expected_date_map: HashMap<String, String> = expected_date
             .into_iter()
             .map(|(col, val)| (col.to_string(), val.to_string()))
@@ -1360,31 +1331,6 @@ mod tests {
             "date metadata mismatch for SQL: {}",
             sql
         );
-
-        // Check variant metadata
-        if expected_variant.is_empty() {
-            assert!(
-                !variant_metadata.contains_key("data"),
-                "variant metadata should not be present for SQL: {}",
-                sql
-            );
-        } else {
-            let mut actual = variant_metadata
-                .get("data")
-                .map(|v| variant_paths_from_metadata(v))
-                .unwrap_or_default();
-            actual.sort();
-            let mut expected: Vec<String> = expected_variant
-                .into_iter()
-                .map(|s| s.to_string())
-                .collect();
-            expected.sort();
-            assert_eq!(
-                actual, expected,
-                "variant metadata mismatch for SQL: {}",
-                sql
-            );
-        }
     }
 
     // ─────────────────────────────────────────────────────────────────────────────
@@ -1739,7 +1685,7 @@ mod tests {
     // ─────────────────────────────────────────────────────────────────────────────
 
     #[tokio::test]
-    async fn variant_get_type_hint_propagated() {
+    async fn variant_get_type_hint_does_not_emit_metadata() {
         let (_dir, ctx, _) = setup_variant_table().await;
 
         let df = ctx
@@ -1750,17 +1696,11 @@ mod tests {
         let optimized = state.optimize(&plan).unwrap();
         let physical_plan = state.create_physical_plan(&optimized).await.unwrap();
 
-        let metadata = extract_field_metadata(&physical_plan, VARIANT_MAPPING_METADATA_KEY);
-
-        let entries = metadata
-            .get("data")
-            .map(|value| parse_variant_metadata(value))
-            .unwrap_or_default();
-        let entry = entries
-            .iter()
-            .find(|entry| entry.path == "name")
-            .expect("variant metadata entry for name");
-        assert_eq!(entry.data_type.as_deref(), Some("Utf8"));
+        let metadata = extract_field_metadata(&physical_plan, DATE_MAPPING_METADATA_KEY);
+        assert!(
+            !metadata.contains_key("data"),
+            "variant_get should not emit cache metadata"
+        );
     }
 
     #[tokio::test]
diff --git a/src/datafusion/src/optimizers/mod.rs b/src/datafusion/src/optimizers/mod.rs
index cc1950f8..b2490580 100644
--- a/src/datafusion/src/optimizers/mod.rs
+++ b/src/datafusion/src/optimizers/mod.rs
@@ -2,9 +2,9 @@
 
 mod lineage_opt;
 
-use std::{str::FromStr, sync::Arc};
+use std::sync::Arc;
 
-use arrow_schema::{DataType, Field, Fields, Schema, SchemaRef};
+use arrow_schema::{Field, Schema, SchemaRef};
 use datafusion::{
     catalog::memory::DataSourceExec,
     common::tree_node::{Transformed, TreeNode, TreeNodeRecursion},
@@ -19,79 +19,15 @@ use datafusion::{
     physical_plan::ExecutionPlan,
 };
 pub use lineage_opt::LineageOptimizer;
-pub(crate) use lineage_opt::VariantField;
 
 use crate::{
     LiquidCacheParquetRef, LiquidParquetSource,
     optimizers::lineage_opt::{ColumnAnnotation, metadata_from_factory, serialize_date_part},
 };
-use liquid_cache::utils::VariantSchema;
-use serde::{Deserialize, Serialize};
 
 pub(crate) const DATE_MAPPING_METADATA_KEY: &str = "liquid.cache.date_mapping";
-pub(crate) const VARIANT_MAPPING_METADATA_KEY: &str = "liquid.cache.variant_path";
-pub(crate) const VARIANT_MAPPING_TYPE_METADATA_KEY: &str = "liquid.cache.variant_type";
 pub(crate) const STRING_FINGERPRINT_METADATA_KEY: &str = "liquid.cache.string_fingerprint";
 
-#[derive(Debug, Clone, Serialize, Deserialize)]
-struct VariantMappingSerdeEntry {
-    path: String,
-    #[serde(rename = "type", skip_serializing_if = "Option::is_none")]
-    data_type: Option<String>,
-}
-
-pub(crate) fn serialize_variant_mappings(fields: &[VariantField]) -> Option<String> {
-    if fields.is_empty() {
-        return None;
-    }
-
-    let entries: Vec<VariantMappingSerdeEntry> = fields
-        .iter()
-        .map(|field| VariantMappingSerdeEntry {
-            path: field.path.clone(),
-            data_type: field
-                .data_type
-                .as_ref()
-                .map(|data_type| data_type.to_string()),
-        })
-        .collect();
-
-    serde_json::to_string(&entries).ok()
-}
-
-fn deserialize_variant_mappings(raw: &str) -> Option<Vec<VariantField>> {
-    let entries: Vec<VariantMappingSerdeEntry> = serde_json::from_str(raw).ok()?;
-    let mut fields = Vec::with_capacity(entries.len());
-    for entry in entries {
-        let data_type = match entry.data_type {
-            Some(spec) => Some(DataType::from_str(&spec).ok()?),
-            None => None,
-        };
-        fields.push(VariantField {
-            path: entry.path,
-            data_type,
-        });
-    }
-    Some(fields)
-}
-
-pub(crate) fn variant_mappings_from_field(field: &Field) -> Option<Vec<VariantField>> {
-    let metadata = field.metadata();
-    let raw = metadata.get(VARIANT_MAPPING_METADATA_KEY)?;
-    if let Some(parsed) = deserialize_variant_mappings(raw) {
-        return Some(parsed);
-    }
-
-    let data_type = metadata
-        .get(VARIANT_MAPPING_TYPE_METADATA_KEY)
-        .and_then(|spec| DataType::from_str(spec).ok());
-
-    Some(vec![VariantField {
-        path: raw.clone(),
-        data_type,
-    }])
-}
-
 /// Physical optimizer rule for local mode liquid cache
 ///
 /// This optimizer rewrites DataSourceExec nodes that read Parquet files
@@ -99,24 +35,17 @@ pub(crate) fn variant_mappings_from_field(field: &Field) -> Option<Vec<VariantField>>
 pub struct LocalModeOptimizer {
     cache: LiquidCacheParquetRef,
-    eager_shredding: bool,
 }
 
 impl LocalModeOptimizer {
     /// Create an optimizer with an existing cache instance
-    pub fn new(cache: LiquidCacheParquetRef, eager_shredding: bool) -> Self {
-        Self {
-            cache,
-            eager_shredding,
-        }
+    pub fn new(cache: LiquidCacheParquetRef) -> Self {
+        Self { cache }
     }
 
     /// Create an optimizer with an existing cache instance
     pub fn with_cache(cache: LiquidCacheParquetRef) -> Self {
-        Self {
-            cache,
-            eager_shredding: true,
-        }
+        Self { cache }
     }
 }
@@ -126,11 +55,7 @@ impl PhysicalOptimizerRule for LocalModeOptimizer {
         plan: Arc<dyn ExecutionPlan>,
         _config: &ConfigOptions,
     ) -> Result<Arc<dyn ExecutionPlan>, datafusion::error::DataFusionError> {
-        Ok(rewrite_data_source_plan(
-            plan,
-            &self.cache,
-            self.eager_shredding,
-        ))
+        Ok(rewrite_data_source_plan(plan, &self.cache))
     }
 
     fn name(&self) -> &str {
@@ -148,10 +73,9 @@ impl PhysicalOptimizerRule for LocalModeOptimizer {
 pub fn rewrite_data_source_plan(
     plan: Arc<dyn ExecutionPlan>,
     cache: &LiquidCacheParquetRef,
-    eager_shredding: bool,
 ) -> Arc<dyn ExecutionPlan> {
     let rewritten = plan
-        .transform_up(|node| try_optimize_parquet_source(node, cache, eager_shredding))
+        .transform_up(|node| try_optimize_parquet_source(node, cache))
         .unwrap();
     rewritten.data
 }
 
 fn try_optimize_parquet_source(
     plan: Arc<dyn ExecutionPlan>,
     cache: &LiquidCacheParquetRef,
-    eager_shredding: bool,
 ) -> Result<Transformed<Arc<dyn ExecutionPlan>>, datafusion::error::DataFusionError> {
     let any_plan = plan.as_any();
     if let Some(data_source_exec) = any_plan.downcast_ref::<DataSourceExec>()
@@ -171,11 +94,8 @@ fn try_optimize_parquet_source(
         let mut new_source =
             LiquidParquetSource::from_parquet_source(parquet_source.clone(), cache.clone());
         if let Some(expr_adapter_factory) = file_scan_config.expr_adapter_factory.as_ref() {
-            let new_schema = enrich_source_schema(
-                file_scan_config.file_schema(),
-                expr_adapter_factory,
-                eager_shredding,
-            );
+            let new_schema =
+                enrich_source_schema(file_scan_config.file_schema(), expr_adapter_factory);
             let table_partition_cols = new_source.table_schema().table_partition_cols();
             let new_table_schema =
                 TableSchema::new(Arc::new(new_schema), table_partition_cols.clone());
@@ -198,12 +118,11 @@ fn try_optimize_parquet_source(
 fn enrich_source_schema(
     file_schema: &SchemaRef,
     expr_adapter_factory: &Arc<dyn PhysicalExprAdapterFactory>,
-    eager_shredding: bool,
 ) -> Schema {
     let mut new_fields = vec![];
     for field in file_schema.fields() {
         if let Some(annotation) = metadata_from_factory(expr_adapter_factory, field.name()) {
-            new_fields.push(process_field_annotation(field, annotation, eager_shredding));
+            new_fields.push(process_field_annotation(field, annotation));
         } else {
             new_fields.push(field.clone());
         }
@@ -211,13 +130,8 @@ fn enrich_source_schema(
     Schema::new(new_fields)
 }
 
-fn process_field_annotation(
-    field: &Arc<Field>,
-    annotation: ColumnAnnotation,
-    eager_shredding: bool,
-) -> Arc<Field> {
+fn process_field_annotation(field: &Arc<Field>, annotation: ColumnAnnotation) -> Arc<Field> {
     let mut field_metadata = field.metadata().clone();
-    let mut updated_field = Field::clone(field.as_ref());
     match annotation {
         ColumnAnnotation::DatePart(unit) => {
             field_metadata.insert(
@@ -225,14 +139,7 @@ fn process_field_annotation(field: &Arc<Field>, annotation: ColumnAnnotation) -> Arc<Field> {
                 serialize_date_part(&unit),
             );
         }
-        ColumnAnnotation::VariantPaths(paths) => {
-            if eager_shredding {
-                if let Some(serialized) = serialize_variant_mappings(&paths) {
-                    field_metadata.insert(VARIANT_MAPPING_METADATA_KEY.to_string(), serialized);
-                }
-                updated_field = enrich_variant_field_type(&updated_field, &paths);
-            }
-        }
+        ColumnAnnotation::VariantPaths(_) => {}
         ColumnAnnotation::SubstringSearch => {
             field_metadata.insert(
                 STRING_FINGERPRINT_METADATA_KEY.to_string(),
@@ -240,78 +147,7 @@ fn process_field_annotation(field: &Arc<Field>, annotation: ColumnAnnotation) -> Arc<Field> {
             );
         }
     }
-    Arc::new(updated_field.with_metadata(field_metadata))
-}
-
-pub(crate) fn enrich_variant_field_type(field: &Field, fields: &[VariantField]) -> Field {
-    let typed_specs: Vec<&VariantField> = fields
-        .iter()
-        .filter(|field| field.data_type.is_some())
-        .collect();
-    if typed_specs.is_empty() {
-        return Field::clone(field);
-    }
-
-    let new_type = match field.data_type() {
-        DataType::Struct(children) => {
-            let mut rewritten = Vec::with_capacity(children.len() + 1);
-            let mut replaced = false;
-            for child in children.iter() {
-                if child.name() == "typed_value" {
-                    rewritten.push(build_variant_typed_value_field(
-                        Some(child.as_ref()),
-                        &typed_specs,
-                    ));
-                    replaced = true;
-                } else {
-                    let mut child_field = child.as_ref().clone();
-                    if child_field.name() == "value" {
-                        child_field =
-                            Field::new(child_field.name(), child_field.data_type().clone(), true)
-                                .with_metadata(child_field.metadata().clone());
-                    }
-                    rewritten.push(Arc::new(child_field));
-                }
-            }
-            if !replaced {
-                rewritten.push(build_variant_typed_value_field(None, &typed_specs));
-            }
-            DataType::Struct(Fields::from(rewritten))
-        }
-        other => other.clone(),
-    };
-    Field::clone(field).with_data_type(new_type)
-}
-
-pub(crate) fn enrich_schema_for_cache(schema: &SchemaRef) -> SchemaRef {
-    let mut fields = vec![];
-    for field in schema.fields() {
-        let new_field = if let Some(mappings) = variant_mappings_from_field(field.as_ref()) {
-            Arc::new(enrich_variant_field_type(field.as_ref(), &mappings))
-        } else {
-            field.clone()
-        };
-        fields.push(new_field);
-    }
-    Arc::new(Schema::new(fields))
-}
-
-fn build_variant_typed_value_field(
-    existing: Option<&Field>,
-    specs: &[&VariantField],
-) -> Arc<Field> {
-    let mut schema = VariantSchema::new(existing);
-    for spec in specs {
-        if let Some(data_type) = spec.data_type.as_ref() {
-            schema.insert_path(&spec.path, data_type);
-        }
-    }
-
-    Arc::new(Field::new(
-        "typed_value",
-        DataType::Struct(Fields::from(schema.typed_fields())),
-        true,
-    ))
+    Arc::new(Field::clone(field.as_ref()).with_metadata(field_metadata))
 }
 
 #[cfg(test)]
@@ -336,6 +172,7 @@ mod tests {
         LiquidCacheParquet::new(
             8192,
             1000000,
+            usize::MAX,
             store,
             Box::new(LiquidPolicy::new()),
             Box::new(TranscodeSqueezeEvict),
         )
         .await,
     );
-    let rewritten = rewrite_data_source_plan(plan, &liquid_cache, true);
+    let rewritten = rewrite_data_source_plan(plan, &liquid_cache);
 
     rewritten
         .apply(|node| {
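With eager shredding removed, wiring the optimizer up needs only the cache handle, and both entry points shrink accordingly. A rough sketch (names taken from the hunks above; the surrounding session wiring is omitted):

    // As a physical optimizer rule:
    let optimizer = LocalModeOptimizer::new(cache_ref.clone());
    // Or rewriting an already-built physical plan directly, as the server does:
    let rewritten = rewrite_data_source_plan(plan, &cache_ref);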
TableSchema, - metrics: ExecutionPlanMetricsSet, - parquet_file_reader_factory: Arc, - reorder_filters: bool, - liquid_cache: LiquidCacheParquetRef, - expr_adapter_factory: Arc, - span: Option>, -} - -impl LiquidParquetOpener { - #[allow(clippy::too_many_arguments)] - pub fn new( - partition_index: usize, - projection: ProjectionExprs, - batch_size: usize, - limit: Option, - predicate: Option>, - table_schema: TableSchema, - metrics: ExecutionPlanMetricsSet, - liquid_cache: LiquidCacheParquetRef, - parquet_file_reader_factory: Arc, - reorder_filters: bool, - expr_adapter_factory: Arc, - span: Option>, - ) -> Self { - Self { - partition_index, - projection, - batch_size, - limit, - predicate, - table_schema, - metrics, - liquid_cache, - parquet_file_reader_factory, - reorder_filters, - expr_adapter_factory, - span, - } - } -} - -// transfer lineage metadata from tagged schema to dst schema -// The two schema must from the same file. -fn transfer_lineage_metadata_to_file_schema( - tagged_schema: SchemaRef, - dst_schema: SchemaRef, -) -> Schema { - let mut new_fields = vec![]; - - let mut tagged_fields = AHashMap::new(); - for field in tagged_schema.fields().iter() { - tagged_fields.insert(field.name().to_string(), field.clone()); - } - for field in dst_schema.fields().iter() { - let tagged_field = match tagged_fields.get(field.name()) { - Some(tagged_field) => { - let new_field = Field::clone(field).with_metadata(tagged_field.metadata().clone()); - Arc::new(new_field) - } - None => field.clone(), - }; - new_fields.push(tagged_field); - } - let dst_metadata = dst_schema.metadata().clone(); - Schema::new(new_fields).with_metadata(dst_metadata) -} - -impl FileOpener for LiquidParquetOpener { - fn open(&self, partitioned_file: PartitionedFile) -> Result { - let file_range = partitioned_file.range.clone(); - let extensions = partitioned_file.extensions.clone(); - let file_name = partitioned_file.object_meta.location.to_string(); - let file_metrics = ParquetFileMetrics::new(self.partition_index, &file_name, &self.metrics); - - let metadata_size_hint = partitioned_file.metadata_size_hint; - - let lc = self.liquid_cache.clone(); - let file_loc = partitioned_file.object_meta.location.to_string(); - - let mut async_file_reader = self.parquet_file_reader_factory.create_liquid_reader( - self.partition_index, - partitioned_file.clone(), - metadata_size_hint, - &self.metrics, - ); - - let batch_size = self.batch_size; - let logical_file_schema = Arc::clone(self.table_schema.file_schema()); - let output_schema = Arc::new( - self.projection - .project_schema(self.table_schema.table_schema())?, - ); - let mut projection = self.projection.clone(); - let mut predicate = self.predicate.clone(); - let mut literal_columns = std::collections::HashMap::new(); - for (field, value) in self - .table_schema - .table_partition_cols() - .iter() - .zip(partitioned_file.partition_values.iter()) - { - literal_columns.insert(field.name().clone(), value.clone()); - } - if !literal_columns.is_empty() { - projection = projection.try_map_exprs(|expr| { - replace_columns_with_literals(Arc::clone(&expr), &literal_columns) - })?; - predicate = predicate - .map(|p| replace_columns_with_literals(p, &literal_columns)) - .transpose()?; - } - let reorder_predicates = self.reorder_filters; - let limit = self.limit; - - let predicate_creation_errors = - MetricBuilder::new(&self.metrics).global_counter("num_predicate_creation_errors"); - - let expr_adapter_factory = Arc::clone(&self.expr_adapter_factory); - let span = self.span.clone(); - 
Ok(Box::pin(async move { - // Prune this file using the file level statistics and partition values. - // Since dynamic filters may have been updated since planning it is possible that we are able - // to prune files now that we couldn't prune at planning time. - // It is assumed that there is no point in doing pruning here if the predicate is not dynamic, - // as it would have been done at planning time. - // We'll also check this after every record batch we read, - // and if at some point we are able to prove we can prune the file using just the file level statistics - // we can end the stream early. - let mut file_pruner = predicate - .as_ref() - .filter(|p| is_dynamic_physical_expr(p) || partitioned_file.has_statistics()) - .and_then(|p| { - FilePruner::try_new( - Arc::clone(p), - &logical_file_schema, - &partitioned_file, - predicate_creation_errors.clone(), - ) - }); - - if let Some(file_pruner) = &mut file_pruner - && file_pruner.should_prune()? - { - file_metrics.files_ranges_pruned_statistics.add_pruned(1); - return Ok(futures::stream::empty().boxed()); - } - - file_metrics.files_ranges_pruned_statistics.add_matched(1); - - let mut options = ArrowReaderOptions::new() - .with_page_index_policy(parquet::file::metadata::PageIndexPolicy::Required); - let mut metadata_timer = file_metrics.metadata_load_time.timer(); - - // Begin by loading the metadata from the underlying reader (note - // the returned metadata may actually include page indexes as some - // readers may return page indexes even when not requested -- for - // example when they are cached) - let mut reader_metadata = - ArrowReaderMetadata::load_async(&mut async_file_reader, options.clone()).await?; - - // Note about schemas: we are actually dealing with **3 different schemas** here: - // - The table schema as defined by the TableProvider. - // This is what the user sees, what they get when they `SELECT * FROM table`, etc. - // - The logical file schema: this is the table schema minus any hive partition columns and projections. - // This is what the physical file schema is coerced to. - // - The physical file schema: this is the schema as defined by the parquet file. This is what the parquet file actually contains. 
- let physical_file_schema = Arc::clone(reader_metadata.schema()); - let physical_file_schema = Arc::new(transfer_lineage_metadata_to_file_schema( - Arc::clone(&logical_file_schema), - Arc::clone(&physical_file_schema), - )); - let cache_full_schema = enrich_schema_for_cache(&physical_file_schema); - options = options.with_schema(Arc::clone(&physical_file_schema)); - reader_metadata = - ArrowReaderMetadata::try_new(Arc::clone(reader_metadata.metadata()), options)?; - debug_assert!( - Arc::strong_count(reader_metadata.metadata()) > 1, - "meta data must be cached already" - ); - - let rewriter = expr_adapter_factory.create( - Arc::clone(&logical_file_schema), - Arc::clone(&physical_file_schema), - )?; - let simplifier = PhysicalExprSimplifier::new(&physical_file_schema); - predicate = predicate - .map(|p| simplifier.simplify(rewriter.rewrite(p)?)) - .transpose()?; - projection = projection.try_map_exprs(|p| simplifier.simplify(rewriter.rewrite(p)?))?; - - let (pruning_predicate, page_pruning_predicate) = build_pruning_predicates( - predicate.as_ref(), - &physical_file_schema, - &predicate_creation_errors, - ); - - metadata_timer.stop(); - - let mut builder = ParquetRecordBatchStreamBuilder::new_with_metadata( - async_file_reader.clone(), - reader_metadata.clone(), - ); - let indices = projection.column_indices(); - let mask = ProjectionMask::roots(builder.parquet_schema(), indices); - - // Filter pushdown: evaluate predicates during scan - let row_filter = predicate.as_ref().and_then(|p| { - let row_filter = build_row_filter( - p, - &physical_file_schema, - reader_metadata.metadata(), - reorder_predicates, - &file_metrics, - ); - - match row_filter { - Ok(Some(filter)) => Some(filter), - Ok(None) => None, - Err(e) => { - debug!("Ignoring error building row filter for '{predicate:?}': {e:?}"); - None - } - } - }); - - // Determine which row groups to actually read. 
The idea is to skip - // as many row groups as possible based on the metadata and query - let file_metadata: Arc = Arc::clone(builder.metadata()); - let predicate = pruning_predicate.as_ref().map(|p| p.as_ref()); - let rg_metadata = file_metadata.row_groups(); - // track which row groups to actually read - let access_plan = create_initial_plan(&file_name, extensions, rg_metadata.len())?; - let mut row_groups = RowGroupAccessPlanFilter::new(access_plan); - // if there is a range restricting what parts of the file to read - if let Some(range) = file_range.as_ref() { - row_groups.prune_by_range(rg_metadata, range); - } - // If there is a predicate that can be evaluated against the metadata - if let Some(predicate) = predicate.as_ref() { - row_groups.prune_by_statistics( - &physical_file_schema, - builder.parquet_schema(), - rg_metadata, - predicate, - &file_metrics, - ); - - if !row_groups.is_empty() { - row_groups - .prune_by_bloom_filters( - &physical_file_schema, - &mut builder, - predicate, - &file_metrics, - ) - .await; - } - } - - let mut access_plan = row_groups.build(); - - // page index pruning: if all data on individual pages can - // be ruled using page metadata, rows from other columns - // with that range can be skipped as well - if !access_plan.is_empty() - && let Some(p) = page_pruning_predicate - { - access_plan = p.prune_plan_with_page_index( - access_plan, - &physical_file_schema, - builder.parquet_schema(), - file_metadata.as_ref(), - &file_metrics, - ); - } - - let row_group_indexes = access_plan.row_group_indexes(); - let row_selection = access_plan.into_overall_row_selection(rg_metadata)?; - - let mut liquid_builder = - LiquidStreamBuilder::new(async_file_reader, Arc::clone(reader_metadata.metadata())) - .with_batch_size(batch_size) - .with_row_groups(row_group_indexes) - .with_projection(mask) - .with_selection(row_selection) - .with_limit(limit); - - if let Some(row_filter) = row_filter { - liquid_builder = liquid_builder.with_row_filter(row_filter); - } - - if let Some(s) = &span { - let span = fastrace::Span::enter_with_parent("liquid_stream", s); - liquid_builder = liquid_builder.with_span(span); - } - - let liquid_cache = lc.register_or_get_file(file_loc, Arc::clone(&cache_full_schema)); - - let stream = liquid_builder.build(liquid_cache)?; - - let stream_schema = Arc::clone(stream.schema()); - let replace_schema = !stream_schema.eq(&output_schema); - let projection = - projection.try_map_exprs(|expr| reassign_expr_columns(expr, &stream_schema))?; - let projector = projection.make_projector(&stream_schema)?; - - let adapted = stream - .map_err(|e| DataFusionError::External(Box::new(e))) - .map(move |batch| { - batch.and_then(|batch| { - let batch = projector.project_batch(&batch)?; - if replace_schema { - let (_schema, arrays, num_rows) = batch.into_parts(); - let options = RecordBatchOptions::new().with_row_count(Some(num_rows)); - RecordBatch::try_new_with_options( - Arc::clone(&output_schema), - arrays, - &options, - ) - .map_err(Into::into) - } else { - Ok(batch) - } - }) - }); - - Ok(adapted.boxed()) - })) - } -} - -fn create_initial_plan( - file_name: &str, - extensions: Option>, - row_group_count: usize, -) -> Result { - if let Some(extensions) = extensions { - if let Some(access_plan) = extensions.downcast_ref::() { - let plan_len = access_plan.len(); - if plan_len != row_group_count { - return exec_err!( - "Invalid ParquetAccessPlan for {file_name}. 
Specified {plan_len} row groups, but file has {row_group_count}" - ); - } - - // check row group count matches the plan - return Ok(access_plan.clone()); - } else { - debug!("ParquetExec Ignoring unknown extension specified for {file_name}"); - } - } - - // default to scanning all row groups - Ok(ParquetAccessPlan::new_all(row_group_count)) -} - -pub(crate) fn build_pruning_predicates( - predicate: Option<&Arc>, - file_schema: &SchemaRef, - predicate_creation_errors: &Count, -) -> ( - Option>, - Option>, -) { - let Some(predicate) = predicate.as_ref() else { - return (None, None); - }; - let pruning_predicate = build_pruning_predicate( - Arc::clone(predicate), - file_schema, - predicate_creation_errors, - ); - let page_pruning_predicate = build_page_pruning_predicate(predicate, file_schema); - (pruning_predicate, Some(page_pruning_predicate)) -} - -/// Build a page pruning predicate from an optional predicate expression. -/// If the predicate is None or the predicate cannot be converted to a page pruning -/// predicate, return None. -pub(crate) fn build_page_pruning_predicate( - predicate: &Arc, - file_schema: &SchemaRef, -) -> Arc { - Arc::new(PagePruningAccessPlanFilter::new( - predicate, - Arc::clone(file_schema), - )) -} + }, +}; +use ahash::AHashMap; +use arrow::array::{RecordBatch, RecordBatchOptions}; +use arrow_schema::{Field, Schema, SchemaRef}; +use datafusion::{ + common::exec_err, + datasource::{ + listing::PartitionedFile, + physical_plan::{ + FileOpenFuture, FileOpener, ParquetFileMetrics, + parquet::{PagePruningAccessPlanFilter, ParquetAccessPlan}, + }, + table_schema::TableSchema, + }, + error::DataFusionError, + physical_expr::PhysicalExprSimplifier, + physical_expr::projection::ProjectionExprs, + physical_expr::utils::reassign_expr_columns, + physical_expr_adapter::{PhysicalExprAdapterFactory, replace_columns_with_literals}, + physical_expr_common::physical_expr::is_dynamic_physical_expr, + physical_optimizer::pruning::{FilePruner, PruningPredicate, build_pruning_predicate}, + physical_plan::{ + PhysicalExpr, + metrics::{Count, ExecutionPlanMetricsSet, MetricBuilder}, + }, +}; +use futures::StreamExt; +use futures::TryStreamExt; +use log::debug; +use parquet::arrow::{ + ParquetRecordBatchStreamBuilder, ProjectionMask, + arrow_reader::{ArrowReaderMetadata, ArrowReaderOptions}, +}; +use parquet::file::metadata::ParquetMetaData; + +use super::source::CachedMetaReaderFactory; + +pub struct LiquidParquetOpener { + partition_index: usize, + projection: ProjectionExprs, + batch_size: usize, + limit: Option, + predicate: Option>, + table_schema: TableSchema, + metrics: ExecutionPlanMetricsSet, + parquet_file_reader_factory: Arc, + reorder_filters: bool, + liquid_cache: LiquidCacheParquetRef, + expr_adapter_factory: Arc, + span: Option>, +} + +impl LiquidParquetOpener { + #[allow(clippy::too_many_arguments)] + pub fn new( + partition_index: usize, + projection: ProjectionExprs, + batch_size: usize, + limit: Option, + predicate: Option>, + table_schema: TableSchema, + metrics: ExecutionPlanMetricsSet, + liquid_cache: LiquidCacheParquetRef, + parquet_file_reader_factory: Arc, + reorder_filters: bool, + expr_adapter_factory: Arc, + span: Option>, + ) -> Self { + Self { + partition_index, + projection, + batch_size, + limit, + predicate, + table_schema, + metrics, + liquid_cache, + parquet_file_reader_factory, + reorder_filters, + expr_adapter_factory, + span, + } + } +} + +// transfer lineage metadata from tagged schema to dst schema +// The two schema must from the same file. 
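+// (That is, both schemas describe the same parquet file.) For example, the
+// date-part and string-fingerprint annotation keys that `enrich_source_schema`
+// attaches at plan time are carried from the tagged logical schema over to the
+// matching fields of the physical file schema.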
+fn transfer_lineage_metadata_to_file_schema( + tagged_schema: SchemaRef, + dst_schema: SchemaRef, +) -> Schema { + let mut new_fields = vec![]; + + let mut tagged_fields = AHashMap::new(); + for field in tagged_schema.fields().iter() { + tagged_fields.insert(field.name().to_string(), field.clone()); + } + for field in dst_schema.fields().iter() { + let tagged_field = match tagged_fields.get(field.name()) { + Some(tagged_field) => { + let new_field = Field::clone(field).with_metadata(tagged_field.metadata().clone()); + Arc::new(new_field) + } + None => field.clone(), + }; + new_fields.push(tagged_field); + } + let dst_metadata = dst_schema.metadata().clone(); + Schema::new(new_fields).with_metadata(dst_metadata) +} + +impl FileOpener for LiquidParquetOpener { + fn open(&self, partitioned_file: PartitionedFile) -> Result { + let file_range = partitioned_file.range.clone(); + let extensions = partitioned_file.extensions.clone(); + let file_name = partitioned_file.object_meta.location.to_string(); + let file_metrics = ParquetFileMetrics::new(self.partition_index, &file_name, &self.metrics); + + let metadata_size_hint = partitioned_file.metadata_size_hint; + + let lc = self.liquid_cache.clone(); + let file_loc = partitioned_file.object_meta.location.to_string(); + + let mut async_file_reader = self.parquet_file_reader_factory.create_liquid_reader( + self.partition_index, + partitioned_file.clone(), + metadata_size_hint, + &self.metrics, + ); + + let batch_size = self.batch_size; + let logical_file_schema = Arc::clone(self.table_schema.file_schema()); + let output_schema = Arc::new( + self.projection + .project_schema(self.table_schema.table_schema())?, + ); + let mut projection = self.projection.clone(); + let mut predicate = self.predicate.clone(); + let mut literal_columns = std::collections::HashMap::new(); + for (field, value) in self + .table_schema + .table_partition_cols() + .iter() + .zip(partitioned_file.partition_values.iter()) + { + literal_columns.insert(field.name().clone(), value.clone()); + } + if !literal_columns.is_empty() { + projection = projection.try_map_exprs(|expr| { + replace_columns_with_literals(Arc::clone(&expr), &literal_columns) + })?; + predicate = predicate + .map(|p| replace_columns_with_literals(p, &literal_columns)) + .transpose()?; + } + let reorder_predicates = self.reorder_filters; + let limit = self.limit; + + let predicate_creation_errors = + MetricBuilder::new(&self.metrics).global_counter("num_predicate_creation_errors"); + + let expr_adapter_factory = Arc::clone(&self.expr_adapter_factory); + let span = self.span.clone(); + Ok(Box::pin(async move { + // Prune this file using the file level statistics and partition values. + // Since dynamic filters may have been updated since planning it is possible that we are able + // to prune files now that we couldn't prune at planning time. + // It is assumed that there is no point in doing pruning here if the predicate is not dynamic, + // as it would have been done at planning time. + // We'll also check this after every record batch we read, + // and if at some point we are able to prove we can prune the file using just the file level statistics + // we can end the stream early. 
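+            // For instance, the dynamic filter fed by a Top-K operator may
+            // have tightened enough since planning that the file's min/max
+            // statistics now prove it contains no qualifying rows
+            // (illustrative scenario; any dynamic filter behaves this way).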
+            let mut file_pruner = predicate
+                .as_ref()
+                .filter(|p| is_dynamic_physical_expr(p) || partitioned_file.has_statistics())
+                .and_then(|p| {
+                    FilePruner::try_new(
+                        Arc::clone(p),
+                        &logical_file_schema,
+                        &partitioned_file,
+                        predicate_creation_errors.clone(),
+                    )
+                });
+
+            if let Some(file_pruner) = &mut file_pruner
+                && file_pruner.should_prune()?
+            {
+                file_metrics.files_ranges_pruned_statistics.add_pruned(1);
+                return Ok(futures::stream::empty().boxed());
+            }
+
+            file_metrics.files_ranges_pruned_statistics.add_matched(1);
+
+            let mut options = ArrowReaderOptions::new()
+                .with_page_index_policy(parquet::file::metadata::PageIndexPolicy::Required);
+            let mut metadata_timer = file_metrics.metadata_load_time.timer();
+
+            // Begin by loading the metadata from the underlying reader (note
+            // the returned metadata may actually include page indexes, as some
+            // readers may return page indexes even when not requested -- for
+            // example when they are cached)
+            let mut reader_metadata =
+                ArrowReaderMetadata::load_async(&mut async_file_reader, options.clone()).await?;
+
+            // Note about schemas: we are actually dealing with **3 different schemas** here:
+            // - The table schema as defined by the TableProvider.
+            //   This is what the user sees, what they get when they `SELECT * FROM table`, etc.
+            // - The logical file schema: this is the table schema minus any hive partition columns and projections.
+            //   This is what the physical file schema is coerced to.
+            // - The physical file schema: this is the schema as defined by the parquet file. This is what the parquet file actually contains.
+            let physical_file_schema = Arc::clone(reader_metadata.schema());
+            let physical_file_schema = Arc::new(transfer_lineage_metadata_to_file_schema(
+                Arc::clone(&logical_file_schema),
+                Arc::clone(&physical_file_schema),
+            ));
+            let cache_full_schema = Arc::clone(&physical_file_schema);
+            options = options.with_schema(Arc::clone(&physical_file_schema));
+            reader_metadata =
+                ArrowReaderMetadata::try_new(Arc::clone(reader_metadata.metadata()), options)?;
+            debug_assert!(
+                Arc::strong_count(reader_metadata.metadata()) > 1,
+                "metadata must be cached already"
+            );
+
+            let rewriter = expr_adapter_factory.create(
+                Arc::clone(&logical_file_schema),
+                Arc::clone(&physical_file_schema),
+            )?;
+            let simplifier = PhysicalExprSimplifier::new(&physical_file_schema);
+            predicate = predicate
+                .map(|p| simplifier.simplify(rewriter.rewrite(p)?))
+                .transpose()?;
+            projection = projection.try_map_exprs(|p| simplifier.simplify(rewriter.rewrite(p)?))?;
+
+            let (pruning_predicate, page_pruning_predicate) = build_pruning_predicates(
+                predicate.as_ref(),
+                &physical_file_schema,
+                &predicate_creation_errors,
+            );
+
+            metadata_timer.stop();
+
+            let mut builder = ParquetRecordBatchStreamBuilder::new_with_metadata(
+                async_file_reader.clone(),
+                reader_metadata.clone(),
+            );
+            let indices = projection.column_indices();
+            let mask = ProjectionMask::roots(builder.parquet_schema(), indices);
+
+            // Filter pushdown: evaluate predicates during scan
+            let row_filter = predicate.as_ref().and_then(|p| {
+                let row_filter = build_row_filter(
+                    p,
+                    &physical_file_schema,
+                    reader_metadata.metadata(),
+                    reorder_predicates,
+                    &file_metrics,
+                );
+
+                match row_filter {
+                    Ok(Some(filter)) => Some(filter),
+                    Ok(None) => None,
+                    Err(e) => {
+                        debug!("Ignoring error building row filter for '{predicate:?}': {e:?}");
+                        None
+                    }
+                }
+            });
+
+            // Determine which row groups to actually read. The idea is to skip
+            // as many row groups as possible based on the metadata and query
+            let file_metadata: Arc<ParquetMetaData> = Arc::clone(builder.metadata());
+            let predicate = pruning_predicate.as_ref().map(|p| p.as_ref());
+            let rg_metadata = file_metadata.row_groups();
+            // track which row groups to actually read
+            let access_plan = create_initial_plan(&file_name, extensions, rg_metadata.len())?;
+            let mut row_groups = RowGroupAccessPlanFilter::new(access_plan);
+            // if there is a range restricting what parts of the file to read
+            if let Some(range) = file_range.as_ref() {
+                row_groups.prune_by_range(rg_metadata, range);
+            }
+            // If there is a predicate that can be evaluated against the metadata
+            if let Some(predicate) = predicate.as_ref() {
+                row_groups.prune_by_statistics(
+                    &physical_file_schema,
+                    builder.parquet_schema(),
+                    rg_metadata,
+                    predicate,
+                    &file_metrics,
+                );
+
+                if !row_groups.is_empty() {
+                    row_groups
+                        .prune_by_bloom_filters(
+                            &physical_file_schema,
+                            &mut builder,
+                            predicate,
+                            &file_metrics,
+                        )
+                        .await;
+                }
+            }
+
+            let mut access_plan = row_groups.build();
+
+            // page index pruning: if all data on individual pages can
+            // be ruled out using page metadata, rows from other columns
+            // in that range can be skipped as well
+            if !access_plan.is_empty()
+                && let Some(p) = page_pruning_predicate
+            {
+                access_plan = p.prune_plan_with_page_index(
+                    access_plan,
+                    &physical_file_schema,
+                    builder.parquet_schema(),
+                    file_metadata.as_ref(),
+                    &file_metrics,
+                );
+            }
+
+            let row_group_indexes = access_plan.row_group_indexes();
+            let row_selection = access_plan.into_overall_row_selection(rg_metadata)?;
+
+            let mut liquid_builder =
+                LiquidStreamBuilder::new(async_file_reader, Arc::clone(reader_metadata.metadata()))
+                    .with_batch_size(batch_size)
+                    .with_row_groups(row_group_indexes)
+                    .with_projection(mask)
+                    .with_selection(row_selection)
+                    .with_limit(limit);
+
+            if let Some(row_filter) = row_filter {
+                liquid_builder = liquid_builder.with_row_filter(row_filter);
+            }
+
+            if let Some(s) = &span {
+                let span = fastrace::Span::enter_with_parent("liquid_stream", s);
+                liquid_builder = liquid_builder.with_span(span);
+            }
+
+            let liquid_cache = lc.register_or_get_file(file_loc, Arc::clone(&cache_full_schema));
+
+            let stream = liquid_builder.build(liquid_cache)?;
+
+            let stream_schema = Arc::clone(stream.schema());
+            let replace_schema = !stream_schema.eq(&output_schema);
+            let projection =
+                projection.try_map_exprs(|expr| reassign_expr_columns(expr, &stream_schema))?;
+            let projector = projection.make_projector(&stream_schema)?;
+
+            let adapted = stream
+                .map_err(|e| DataFusionError::External(Box::new(e)))
+                .map(move |batch| {
+                    batch.and_then(|batch| {
+                        let batch = projector.project_batch(&batch)?;
+                        if replace_schema {
+                            let (_schema, arrays, num_rows) = batch.into_parts();
+                            let options = RecordBatchOptions::new().with_row_count(Some(num_rows));
+                            RecordBatch::try_new_with_options(
+                                Arc::clone(&output_schema),
+                                arrays,
+                                &options,
+                            )
+                            .map_err(Into::into)
+                        } else {
+                            Ok(batch)
+                        }
+                    })
+                });
+
+            Ok(adapted.boxed())
+        }))
+    }
+}
+
+fn create_initial_plan(
+    file_name: &str,
+    extensions: Option<Arc<dyn std::any::Any + Send + Sync>>,
+    row_group_count: usize,
+) -> Result<ParquetAccessPlan, DataFusionError> {
+    if let Some(extensions) = extensions {
+        if let Some(access_plan) = extensions.downcast_ref::<ParquetAccessPlan>() {
+            let plan_len = access_plan.len();
+            if plan_len != row_group_count {
+                return exec_err!(
+                    "Invalid ParquetAccessPlan for {file_name}. Specified {plan_len} row groups, but file has {row_group_count}"
+                );
+            }
+
+            // check row group count matches the plan
+            return Ok(access_plan.clone());
+        } else {
+            debug!("ParquetExec Ignoring unknown extension specified for {file_name}");
+        }
+    }
+
+    // default to scanning all row groups
+    Ok(ParquetAccessPlan::new_all(row_group_count))
+}
+
+pub(crate) fn build_pruning_predicates(
+    predicate: Option<&Arc<dyn PhysicalExpr>>,
+    file_schema: &SchemaRef,
+    predicate_creation_errors: &Count,
+) -> (
+    Option<Arc<PruningPredicate>>,
+    Option<Arc<PagePruningAccessPlanFilter>>,
+) {
+    let Some(predicate) = predicate.as_ref() else {
+        return (None, None);
+    };
+    let pruning_predicate = build_pruning_predicate(
+        Arc::clone(predicate),
+        file_schema,
+        predicate_creation_errors,
+    );
+    let page_pruning_predicate = build_page_pruning_predicate(predicate, file_schema);
+    (pruning_predicate, Some(page_pruning_predicate))
+}
+
+/// Build a page pruning predicate for the given predicate expression.
+/// Unlike `build_pruning_predicates`, the predicate here is required and a
+/// page pruning filter is always returned.
+pub(crate) fn build_page_pruning_predicate(
+    predicate: &Arc<dyn PhysicalExpr>,
+    file_schema: &SchemaRef,
+) -> Arc<PagePruningAccessPlanFilter> {
+    Arc::new(PagePruningAccessPlanFilter::new(
+        predicate,
+        Arc::clone(file_schema),
+    ))
+}
diff --git a/src/datafusion/src/reader/runtime/liquid_cache_reader.rs b/src/datafusion/src/reader/runtime/liquid_cache_reader.rs
index 87c123ec..4a5a6b27 100644
--- a/src/datafusion/src/reader/runtime/liquid_cache_reader.rs
+++ b/src/datafusion/src/reader/runtime/liquid_cache_reader.rs
@@ -3,16 +3,21 @@
 use std::pin::Pin;
 use std::sync::Arc;
 use std::task::{Context, Poll};
 
-use arrow::array::{Array, RecordBatch};
+use arrow::array::{Array, ArrayRef, BooleanArray, RecordBatch};
 use arrow::buffer::BooleanBuffer;
 use arrow::compute::prep_null_mask_filter;
 use arrow::record_batch::RecordBatchOptions;
-use arrow_schema::{ArrowError, SchemaRef};
-use futures::{Stream, future::BoxFuture};
-use parquet::arrow::arrow_reader::{RowSelection, RowSelector};
-
-use crate::cache::{BatchID, CachedRowGroupRef};
-use crate::reader::plantime::LiquidRowFilter;
+use arrow_schema::{ArrowError, Schema, SchemaRef};
+use futures::{Stream, StreamExt, future::BoxFuture, stream::BoxStream};
+use parquet::arrow::arrow_reader::{
+    ArrowPredicate, ArrowReaderMetadata, ArrowReaderOptions, RowSelection, RowSelector,
+};
+use parquet::arrow::{ParquetRecordBatchStreamBuilder, ProjectionMask};
+use parquet::errors::ParquetError;
+use parquet::file::metadata::ParquetMetaData;
+
+use crate::cache::{BatchID, CachedRowGroupRef, InsertArrowArrayError};
+use crate::reader::plantime::{LiquidRowFilter, ParquetMetadataCacheReader};
 use crate::reader::runtime::utils::take_next_batch;
 use crate::utils::{boolean_buffer_and_then, row_selector_to_boolean_buffer};
 
@@ -22,7 +27,7 @@ pub(crate) struct LiquidCacheReader {
 }
 
 enum ReaderState {
-    Ready(LiquidCacheReaderInner),
+    Ready(Box<LiquidCacheReaderInner>),
     Processing(
         BoxFuture<
             'static,
@@ -48,27 +53,56 @@ struct LiquidCacheReaderInner {
     schema: SchemaRef,
     batch_size: usize,
     projection_columns: Vec<usize>,
+    parquet_fallback: ParquetFallback,
+    last_pull: Option<(BatchID, RecordBatch)>,
+}
+
+pub(crate) struct LiquidCacheReaderConfig {
+    pub(crate) batch_size: usize,
+    pub(crate) selection: RowSelection,
+    pub(crate) row_filter: Option<LiquidRowFilter>,
+    pub(crate) cached_row_group: CachedRowGroupRef,
+    pub(crate) projection_columns: Vec<usize>,
+    pub(crate) schema: SchemaRef,
+    pub(crate) parquet_fallback: ParquetFallbackConfig,
+}
+
+#[derive(Clone)]
+pub(crate) struct ParquetFallbackConfig {
+    pub(crate) row_group_idx: usize,
+    pub(crate) metadata: Arc<ParquetMetaData>,
+    pub(crate) input: ParquetMetadataCacheReader,
+    pub(crate) cache_projection: ProjectionMask,
+    pub(crate) cache_column_ids: Vec<usize>,
+    pub(crate) cache_batch_size: usize,
+    pub(crate) row_count: usize,
+}
+
+struct ParquetFallback {
+    row_group_idx: usize,
+    metadata: Arc<ParquetMetaData>,
+    input: ParquetMetadataCacheReader,
+    cache_projection: ProjectionMask,
+    cache_column_ids: Vec<usize>,
+    cache_batch_size: usize,
+    row_count: usize,
+    stream: Option<BoxStream<'static, Result<RecordBatch, ParquetError>>>,
+    next_batch_id: BatchID,
 }
 
 impl LiquidCacheReader {
-    pub(crate) fn new(
-        batch_size: usize,
-        selection: RowSelection,
-        row_filter: Option<LiquidRowFilter>,
-        cached_row_group: CachedRowGroupRef,
-        projection_columns: Vec<usize>,
-        schema: SchemaRef,
-    ) -> Self {
+    pub(crate) fn new(config: LiquidCacheReaderConfig) -> Self {
         let inner = LiquidCacheReaderInner::new(
-            batch_size,
-            selection,
-            cached_row_group,
-            projection_columns,
-            Arc::clone(&schema),
+            config.batch_size,
+            config.selection,
+            config.cached_row_group,
+            config.projection_columns,
+            Arc::clone(&config.schema),
+            ParquetFallback::new(config.parquet_fallback),
         );
         Self {
-            state: ReaderState::Ready(inner),
-            row_filter,
+            state: ReaderState::Ready(Box::new(inner)),
+            row_filter: config.row_filter,
         }
     }
@@ -96,7 +130,7 @@ impl Stream for LiquidCacheReader {
                 }
                 Poll::Ready((inner, row_filter, result)) => {
                     self.row_filter = row_filter;
-                    self.state = ReaderState::Ready(inner);
+                    self.state = ReaderState::Ready(Box::new(inner));
                     match result {
                         ProcessResult::Emit(item) => return Poll::Ready(Some(item)),
                         ProcessResult::Skip => continue,
@@ -106,6 +140,7 @@
                 ReaderState::Ready(mut inner) => {
                     match take_next_batch(&mut inner.selection, inner.batch_size) {
                         Some(selection) => {
+                            let inner = *inner;
                            let future = inner.next_batch(self.row_filter.take(), selection);
                            self.state = ReaderState::Processing(future);
                            continue;
@@ -125,6 +160,86 @@
     }
 }
 
+impl ParquetFallback {
+    fn new(config: ParquetFallbackConfig) -> Self {
+        Self {
+            row_group_idx: config.row_group_idx,
+            metadata: config.metadata,
+            input: config.input,
+            cache_projection: config.cache_projection,
+            cache_column_ids: config.cache_column_ids,
+            cache_batch_size: config.cache_batch_size,
+            row_count: config.row_count,
+            stream: None,
+            next_batch_id: BatchID::from_raw(0),
+        }
+    }
+
+    async fn fetch_batch(&mut self, batch_id: BatchID) -> Result<RecordBatch, ParquetError> {
+        // The stream is only reusable for sequential reads; any other access
+        // pattern rebuilds it starting at the requested batch.
+        if self.stream.is_none() || batch_id != self.next_batch_id {
+            self.rebuild_stream(batch_id)?;
+        }
+
+        let stream = self.stream.as_mut().expect("fallback stream is present");
+        let record_batch = stream.next().await.transpose()?.ok_or_else(|| {
+            ParquetError::General(format!(
+                "parquet fallback ended before batch {}",
+                *batch_id as usize
+            ))
+        })?;
+
+        self.next_batch_id = batch_id;
+        self.next_batch_id.inc();
+        Ok(record_batch)
+    }
+
+    fn rebuild_stream(&mut self, batch_id: BatchID) -> Result<(), ParquetError> {
+        let reader_metadata =
+            ArrowReaderMetadata::try_new(Arc::clone(&self.metadata), ArrowReaderOptions::new())?;
+        let row_selection =
+            build_row_selection_from(batch_id, self.cache_batch_size, self.row_count);
+
+        let stream =
+            ParquetRecordBatchStreamBuilder::new_with_metadata(self.input.clone(), reader_metadata)
+                .with_projection(self.cache_projection.clone())
+                .with_row_groups(vec![self.row_group_idx])
+                .with_batch_size(self.cache_batch_size)
+                .with_row_selection(row_selection)
+                .build()?
+                .boxed();
+
+        self.stream = Some(stream);
+        self.next_batch_id = batch_id;
+        Ok(())
+    }
+}
+
+/// Build a selection that skips to the start of `batch_id` and then selects
+/// the rest of the row group in `batch_size` chunks, so the same stream can
+/// serve later sequential batches without being rebuilt.
+fn build_row_selection_from(
+    batch_id: BatchID,
+    batch_size: usize,
+    row_count: usize,
+) -> RowSelection {
+    let start = usize::from(*batch_id) * batch_size;
+    let mut selectors = Vec::new();
+
+    if start > 0 {
+        selectors.push(RowSelector::skip(start.min(row_count)));
+    }
+
+    if start >= row_count {
+        return RowSelection::from(selectors);
+    }
+
+    let mut remaining = row_count - start;
+    while remaining > 0 {
+        let selected = remaining.min(batch_size);
+        selectors.push(RowSelector::select(selected));
+        remaining -= selected;
+    }
+
+    RowSelection::from(selectors)
+}
+
 impl LiquidCacheReaderInner {
     fn new(
@@ -132,6 +247,7 @@
         cached_row_group: CachedRowGroupRef,
         projection_columns: Vec<usize>,
         schema: SchemaRef,
+        parquet_fallback: ParquetFallback,
     ) -> Self {
         Self {
             cached_row_group,
@@ -140,6 +256,8 @@
             schema,
             batch_size,
             projection_columns,
+            parquet_fallback,
+            last_pull: None,
         }
     }
@@ -151,6 +269,7 @@
         Box::pin(async move {
             let mut inner = self;
             let mut row_filter = row_filter;
+            inner.last_pull = None;
             let result = match inner
                 .build_predicate_filter(&mut row_filter, selection)
@@ -191,7 +310,7 @@
                 break;
             }
 
-            let boolean_array = self
+            let boolean_array = match self
                 .cached_row_group
                 .evaluate_selection_with_predicate(
                     self.current_batch_id,
@@ -199,7 +318,13 @@
                     predicate,
                 )
                 .await
-                .expect("item must be in cache")?;
+            {
+                Some(result) => result?,
+                None => {
+                    self.evaluate_predicate_after_materialize(&input_selection, predicate)
+                        .await?
+                }
+            };
 
             let boolean_mask = if boolean_array.null_count() == 0 {
                 boolean_array.into_parts().0
@@ -215,7 +340,7 @@
     #[fastrace::trace]
     async fn read_from_cache(
-        &self,
+        &mut self,
         selection: &BooleanBuffer,
     ) -> Result<Option<RecordBatch>, ArrowError> {
         let selected_rows = selection.count_set_bits();
@@ -232,7 +357,7 @@
         }
 
         let mut arrays = Vec::with_capacity(self.projection_columns.len());
-        for &column_idx in &self.projection_columns {
+        for column_idx in self.projection_columns.clone() {
            let column = self
                .cached_row_group
                .get_column(column_idx as u64)
@@ -244,13 +369,18 @@
 
             let array = column
                 .get_arrow_array_with_filter(self.current_batch_id, selection)
-                .await
-                .ok_or_else(|| {
-                    ArrowError::ComputeError(format!(
-                        "column {column_idx} batch {} not cached",
-                        *self.current_batch_id as usize
-                    ))
-                })?;
+                .await;
+
+            let array = match array {
+                Some(array) => array,
+                None => {
+                    // Cache miss: materialize the batch from parquet, insert it
+                    // into the cache, then apply the selection.
+                    let record_batch = self
+                        .read_parquet_batch_and_fill_cache(self.current_batch_id)
+                        .await?;
+                    let array = self.parquet_array(&record_batch, column_idx)?;
+                    filter_array(array, selection)?
+                }
+            };
             arrays.push(array);
         }
@@ -259,6 +389,124 @@
             RecordBatch::try_new(self.schema.clone(), arrays).unwrap(),
         ))
     }
+
+    async fn read_parquet_batch_and_fill_cache(
+        &mut self,
+        batch_id: BatchID,
+    ) -> Result<RecordBatch, ArrowError> {
+        if let Some((pulled_batch_id, record_batch)) = &self.last_pull
+            && *pulled_batch_id == batch_id
+        {
+            return Ok(record_batch.clone());
+        }
+
+        let record_batch = self
+            .parquet_fallback
+            .fetch_batch(batch_id)
+            .await
+            .map_err(|e| ArrowError::ComputeError(format!("parquet fallback read failed: {e}")))?;
+
+        for (col_idx, file_column_id) in self
+            .parquet_fallback
+            .cache_column_ids
+            .iter()
+            .copied()
+            .enumerate()
+        {
+            let column = self
+                .cached_row_group
+                .get_column(file_column_id as u64)
+                .ok_or_else(|| {
+                    ArrowError::ComputeError(format!(
+                        "column {file_column_id} not present in liquid cache"
+                    ))
+                })?;
+            let array = Arc::clone(record_batch.column(col_idx));
+
+            match column.insert(batch_id, array).await {
+                Ok(()) | Err(InsertArrowArrayError::AlreadyCached) => {}
+                Err(InsertArrowArrayError::CacheFull) => {}
+            }
+        }
+
+        self.last_pull = Some((batch_id, record_batch.clone()));
+        Ok(record_batch)
+    }
+
+    async fn evaluate_predicate_after_materialize(
+        &mut self,
+        selection: &BooleanBuffer,
+        predicate: &mut crate::reader::LiquidPredicate,
+    ) -> Result<BooleanArray, ArrowError> {
+        let record_batch = self
+            .read_parquet_batch_and_fill_cache(self.current_batch_id)
+            .await?;
+
+        if let Some(result) = self
+            .cached_row_group
+            .evaluate_selection_with_predicate(self.current_batch_id, selection, predicate)
+            .await
+        {
+            return result;
+        }
+
+        let column_ids = predicate.predicate_column_ids();
+        let mut arrays = Vec::with_capacity(column_ids.len());
+        let mut fields = Vec::with_capacity(column_ids.len());
+
+        for column_id in column_ids {
+            let array = self.parquet_array(&record_batch, column_id)?;
+            arrays.push(filter_array(array, selection)?);
+
+            let field = self
+                .cached_row_group
+                .get_column(column_id as u64)
+                .ok_or_else(|| {
+                    ArrowError::ComputeError(format!(
+                        "column {column_id} not present in liquid cache"
+                    ))
+                })?
+                .field()
+                .as_ref()
+                .clone();
+            fields.push(field);
+        }
+
+        let schema = Arc::new(Schema::new(fields));
+        let predicate_batch = if arrays.is_empty() {
+            let options =
+                RecordBatchOptions::new().with_row_count(Some(selection.count_set_bits()));
+            RecordBatch::try_new_with_options(schema, arrays, &options)?
+        } else {
+            RecordBatch::try_new(schema, arrays)?
+        };
+
+        predicate.evaluate(predicate_batch)
+    }
+
+    fn parquet_array(
+        &self,
+        record_batch: &RecordBatch,
+        file_column_id: usize,
+    ) -> Result<ArrayRef, ArrowError> {
+        let position = self
+            .parquet_fallback
+            .cache_column_ids
+            .iter()
+            .position(|column_id| *column_id == file_column_id)
+            .ok_or_else(|| {
+                ArrowError::ComputeError(format!(
+                    "column {file_column_id} not present in parquet fallback projection"
+                ))
+            })?;
+
+        Ok(Arc::clone(record_batch.column(position)))
+    }
+}
+
+fn filter_array(array: ArrayRef, selection: &BooleanBuffer) -> Result<ArrayRef, ArrowError> {
+    let selection_array = BooleanArray::new(selection.clone(), None);
+    arrow::compute::filter(array.as_ref(), &selection_array)
 }
 
 #[cfg(test)]
@@ -266,11 +514,14 @@ mod tests {
     use super::*;
     use crate::{
         cache::LiquidCacheParquet,
+        reader::plantime::CachedMetaReaderFactory,
         reader::{FilterCandidateBuilder, LiquidPredicate, LiquidRowFilter},
     };
     use arrow::array::{ArrayRef, Int32Array};
     use arrow::record_batch::RecordBatch;
     use arrow_schema::{DataType, Field, Schema, SchemaRef};
+    use datafusion::datasource::listing::PartitionedFile;
+    use datafusion::physical_plan::metrics::ExecutionPlanMetricsSet;
     use datafusion::{
         logical_expr::Operator,
         physical_expr::PhysicalExpr,
@@ -280,31 +531,90 @@ mod tests {
     use futures::{StreamExt, pin_mut};
     use liquid_cache::cache::{AlwaysHydrate, squeeze_policies::Evict};
     use liquid_cache::cache_policies::LiquidPolicy;
+    use object_store::local::LocalFileSystem;
     use parquet::arrow::{
-        ArrowWriter,
+        ArrowWriter, ProjectionMask,
         arrow_reader::{ArrowReaderMetadata, ArrowReaderOptions, RowSelection, RowSelector},
     };
+    use std::fs::File;
     use std::sync::Arc;
 
-    async fn make_row_group(
+    struct TestRowGroup {
         batch_size: usize,
-        batches: &[Vec<i32>],
-    ) -> (CachedRowGroupRef, SchemaRef) {
+        row_group: CachedRowGroupRef,
+        schema: SchemaRef,
+        fallback: ParquetFallbackConfig,
+        _tmp_dir: tempfile::TempDir,
+    }
+
+    struct ReaderRequest {
+        selection: RowSelection,
+        row_filter: Option<LiquidRowFilter>,
+        projection_columns: Vec<usize>,
+        schema: SchemaRef,
+    }
+
+    impl TestRowGroup {
+        fn reader(&self, request: ReaderRequest) -> LiquidCacheReader {
+            LiquidCacheReader::new(LiquidCacheReaderConfig {
+                batch_size: self.batch_size,
+                selection: request.selection,
+                row_filter: request.row_filter,
+                cached_row_group: Arc::clone(&self.row_group),
+                projection_columns: request.projection_columns,
+                schema: request.schema,
+                parquet_fallback: self.fallback.clone(),
+            })
+        }
+    }
+
+    async fn make_row_group(batch_size: usize, batches: &[Vec<i32>]) -> TestRowGroup {
         let tmp_dir = tempfile::tempdir().unwrap();
+        let field = Arc::new(Field::new("col0", DataType::Int32, false));
+        let schema = Arc::new(Schema::new(vec![field.clone()]));
+        let parquet_path = tmp_dir.path().join("data.parquet");
+        let file = File::create(&parquet_path).unwrap();
+        let mut writer = ArrowWriter::try_new(file, Arc::clone(&schema), None).unwrap();
+        for values in batches {
+            let array: ArrayRef = Arc::new(Int32Array::from(values.clone()));
+            let batch = RecordBatch::try_new(Arc::clone(&schema), vec![array]).unwrap();
+            writer.write(&batch).unwrap();
+        }
+        writer.close().unwrap();
+
+        let metadata_file = File::open(&parquet_path).unwrap();
+        let reader_metadata =
+            ArrowReaderMetadata::load(&metadata_file, ArrowReaderOptions::new()).unwrap();
+        let object_store = Arc::new(LocalFileSystem::new_with_prefix(tmp_dir.path()).unwrap());
+        let partitioned_file = PartitionedFile::new(
+            "data.parquet",
+            std::fs::metadata(&parquet_path).unwrap().len(),
+        );
+        let metrics = ExecutionPlanMetricsSet::new();
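+        // The fallback reader below re-reads "data.parquet" through the plain
+        // parquet reader path whenever a batch is missing from the cache; its
+        // projection must line up with `cache_column_ids` (column 0 here).
+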
let input = CachedMetaReaderFactory::new(object_store).create_liquid_reader( + 0, + partitioned_file, + None, + &metrics, + ); + let projection = ProjectionMask::roots( + reader_metadata.metadata().file_metadata().schema_descr(), + [0], + ); + let store = t4::mount(tmp_dir.path().join("liquid_cache.t4")) .await .unwrap(); let cache = LiquidCacheParquet::new( batch_size, usize::MAX, + usize::MAX, store, Box::new(LiquidPolicy::new()), Box::new(Evict), Box::new(AlwaysHydrate::new()), ) .await; - let field = Arc::new(Field::new("col0", DataType::Int32, false)); - let schema = Arc::new(Schema::new(vec![field.clone()])); let file = cache.register_or_get_file("test".to_string(), schema.clone()); let row_group = file.create_row_group(0, vec![]); let column = row_group.get_column(0).unwrap(); @@ -317,7 +627,21 @@ mod tests { .expect("cache insert"); } - (row_group, schema) + TestRowGroup { + batch_size, + row_group, + schema, + fallback: ParquetFallbackConfig { + row_group_idx: 0, + metadata: Arc::clone(reader_metadata.metadata()), + input, + cache_projection: projection, + cache_column_ids: vec![0], + cache_batch_size: batch_size, + row_count: flatten_batches(batches).len(), + }, + _tmp_dir: tmp_dir, + } } fn flatten_batches(batches: &[Vec]) -> Vec { @@ -377,11 +701,15 @@ mod tests { #[tokio::test] async fn reads_batches_in_order() { let batch_size = 2; - let (row_group, schema) = make_row_group(batch_size, &[vec![1, 2], vec![3, 4]]).await; + let test = make_row_group(batch_size, &[vec![1, 2], vec![3, 4]]).await; let selection = RowSelection::from(vec![RowSelector::select(4)]); - let reader = - LiquidCacheReader::new(batch_size, selection, None, row_group, vec![0], schema); + let reader = test.reader(ReaderRequest { + selection, + row_filter: None, + projection_columns: vec![0], + schema: Arc::clone(&test.schema), + }); let batches = collect_batches(reader); assert_eq!(batches.len(), 2); @@ -392,11 +720,15 @@ mod tests { #[tokio::test] async fn skips_unselected_batches() { let batch_size = 2; - let (row_group, schema) = make_row_group(batch_size, &[vec![1, 2], vec![3, 4]]).await; + let test = make_row_group(batch_size, &[vec![1, 2], vec![3, 4]]).await; let selection = RowSelection::from(vec![RowSelector::skip(2), RowSelector::select(2)]); - let reader = - LiquidCacheReader::new(batch_size, selection, None, row_group, vec![0], schema); + let reader = test.reader(ReaderRequest { + selection, + row_filter: None, + projection_columns: vec![0], + schema: Arc::clone(&test.schema), + }); let batches = collect_batches(reader); assert_eq!(batches.len(), 1); @@ -406,17 +738,15 @@ mod tests { #[tokio::test] async fn empty_projection_emits_schema_only_batches() { let batch_size = 2; - let (row_group, _) = make_row_group(batch_size, &[vec![10, 11]]).await; + let test = make_row_group(batch_size, &[vec![10, 11]]).await; let selection = RowSelection::from(vec![RowSelector::select(2)]); - let reader = LiquidCacheReader::new( - batch_size, + let reader = test.reader(ReaderRequest { selection, - None, - row_group, - Vec::new(), - Arc::new(Schema::new(Vec::::new())), - ); + row_filter: None, + projection_columns: Vec::new(), + schema: Arc::new(Schema::new(Vec::::new())), + }); let batches = collect_batches(reader); assert_eq!(batches.len(), 1); @@ -428,18 +758,16 @@ mod tests { #[tokio::test] async fn into_filter_returns_stored_filter_after_completion() { let batch_size = 2; - let (row_group, schema) = make_row_group(batch_size, &[vec![1, 2]]).await; + let test = make_row_group(batch_size, &[vec![1, 2]]).await; let 
selection = RowSelection::from(Vec::::new()); let filter = LiquidRowFilter::new(Vec::new()); - let mut reader = LiquidCacheReader::new( - batch_size, + let mut reader = test.reader(ReaderRequest { selection, - Some(filter), - row_group, - vec![0], - schema, - ); + row_filter: Some(filter), + projection_columns: vec![0], + schema: Arc::clone(&test.schema), + }); let waker = futures::task::noop_waker(); let mut cx = Context::from_waker(&waker); @@ -456,18 +784,16 @@ mod tests { let batches = vec![vec![1, 2], vec![3, 4]]; let batch_size = 2; let all_values = flatten_batches(&batches); - let (row_group, schema) = make_row_group(batch_size, &batches).await; - let filter = make_gt_filter(Arc::clone(&schema), &all_values, 2); + let test = make_row_group(batch_size, &batches).await; + let filter = make_gt_filter(Arc::clone(&test.schema), &all_values, 2); let selection = RowSelection::from(vec![RowSelector::select(4)]); - let reader = LiquidCacheReader::new( - batch_size, + let reader = test.reader(ReaderRequest { selection, - Some(filter), - row_group, - vec![0], - schema, - ); + row_filter: Some(filter), + projection_columns: vec![0], + schema: Arc::clone(&test.schema), + }); let batches = collect_batches(reader); assert_eq!(batches.len(), 1); @@ -499,18 +825,16 @@ mod tests { let batches = vec![vec![1, 2], vec![3, 4], vec![5, 6]]; let batch_size = 2; let all_values = flatten_batches(&batches); - let (row_group, schema) = make_row_group(batch_size, &batches).await; - let filter = make_or_filter(Arc::clone(&schema), &all_values, 4, 2); + let test = make_row_group(batch_size, &batches).await; + let filter = make_or_filter(Arc::clone(&test.schema), &all_values, 4, 2); let selection = RowSelection::from(vec![RowSelector::select(6)]); - let reader = LiquidCacheReader::new( - batch_size, + let reader = test.reader(ReaderRequest { selection, - Some(filter), - row_group, - vec![0], - schema, - ); + row_filter: Some(filter), + projection_columns: vec![0], + schema: Arc::clone(&test.schema), + }); let batches = collect_batches(reader); assert_eq!(batches.len(), 2); @@ -523,22 +847,20 @@ mod tests { let batches = vec![vec![1, 2, 3, 4]]; let batch_size = 4; let all_values = flatten_batches(&batches); - let (row_group, schema) = make_row_group(batch_size, &batches).await; - let filter = make_gt_filter(Arc::clone(&schema), &all_values, 2); + let test = make_row_group(batch_size, &batches).await; + let filter = make_gt_filter(Arc::clone(&test.schema), &all_values, 2); let selection = RowSelection::from(vec![ RowSelector::skip(1), RowSelector::select(2), RowSelector::skip(1), ]); - let reader = LiquidCacheReader::new( - batch_size, + let reader = test.reader(ReaderRequest { selection, - Some(filter), - row_group, - vec![0], - schema, - ); + row_filter: Some(filter), + projection_columns: vec![0], + schema: Arc::clone(&test.schema), + }); let mut batches = collect_batches(reader); assert_eq!(batches.len(), 1); @@ -550,18 +872,16 @@ mod tests { let batches = vec![vec![1, 2]]; let batch_size = 2; let all_values = flatten_batches(&batches); - let (row_group, schema) = make_row_group(batch_size, &batches).await; - let filter = make_gt_filter(Arc::clone(&schema), &all_values, 10); + let test = make_row_group(batch_size, &batches).await; + let filter = make_gt_filter(Arc::clone(&test.schema), &all_values, 10); let selection = RowSelection::from(vec![RowSelector::select(2)]); - let reader = LiquidCacheReader::new( - batch_size, + let reader = test.reader(ReaderRequest { selection, - Some(filter), - row_group, - vec![0], 
- schema, - ); + row_filter: Some(filter), + projection_columns: vec![0], + schema: Arc::clone(&test.schema), + }); let next_batch = futures::executor::block_on(async { pin_mut!(reader); diff --git a/src/datafusion/src/reader/runtime/liquid_stream.rs b/src/datafusion/src/reader/runtime/liquid_stream.rs index b93dfb2f..2933d5d2 100644 --- a/src/datafusion/src/reader/runtime/liquid_stream.rs +++ b/src/datafusion/src/reader/runtime/liquid_stream.rs @@ -1,15 +1,14 @@ -use crate::cache::{BatchID, CachedFileRef, CachedRowGroupRef, InsertArrowArrayError}; +use crate::cache::{CachedFileRef, CachedRowGroupRef}; use crate::reader::plantime::{LiquidRowFilter, ParquetMetadataCacheReader}; use arrow::array::RecordBatch; use arrow_schema::{Schema, SchemaRef}; use fastrace::Event; use fastrace::local::LocalSpan; -use futures::{FutureExt, Stream, StreamExt, future::BoxFuture}; -use parquet::arrow::arrow_reader::{ArrowPredicate, ArrowReaderMetadata, ArrowReaderOptions}; +use futures::Stream; use parquet::{ arrow::{ - ParquetRecordBatchStreamBuilder, ProjectionMask, - arrow_reader::{RowSelection, RowSelector}, + ProjectionMask, + arrow_reader::{ArrowPredicate, RowSelection, RowSelector}, }, errors::ParquetError, file::metadata::ParquetMetaData, @@ -22,11 +21,12 @@ use std::{ task::{Context, Poll}, }; -use super::liquid_cache_reader::LiquidCacheReader; +use super::liquid_cache_reader::{ + LiquidCacheReader, LiquidCacheReaderConfig, ParquetFallbackConfig, +}; use super::utils::{get_root_column_ids, limit_row_selection, offset_row_selection}; type PlanResult = Option; -type FillCacheResult = Result<(ReaderFactory, PlanningContext), ParquetError>; struct ReaderFactory { metadata: Arc, @@ -99,18 +99,11 @@ impl ReaderFactory { *limit -= rows_after; } - let row_count = meta.num_rows() as usize; - let cache_batch_size = self.cached_file.batch_size(); - let mut cache_projection = projection.clone(); if let Some(ref predicate_projection) = predicate_projection { cache_projection.union(predicate_projection); } - let selection_for_cache = selection.clone(); - let selection_batches = - collect_selection_batches(&selection_for_cache, cache_batch_size, row_count); - let schema_descr = self.metadata.file_metadata().schema_descr(); let cache_column_ids = get_root_column_ids(schema_descr, &cache_projection); let predicate_column_ids = if let Some(ref predicate_projection) = predicate_projection { @@ -123,8 +116,6 @@ impl ReaderFactory { .create_row_group(row_group_idx as u64, predicate_column_ids); let projection_column_ids = get_root_column_ids(schema_descr, &projection); - let missing_batches = - compute_missing_batches(&cached_row_group, &cache_column_ids, &selection_batches); let context = PlanningContext { row_group_idx, @@ -134,104 +125,10 @@ impl ReaderFactory { cache_projection, projection_column_ids, cache_column_ids, - missing_batches, }; Some(context) } - - /// Fills the cache by reading missing batches from parquet using official parquet reader - async fn fill_cache_from_parquet(self, context: PlanningContext) -> FillCacheResult { - let row_count = self.metadata.row_group(context.row_group_idx).num_rows() as usize; - let cache_batch_size = context.cached_row_group.batch_size(); - - if context.cache_column_ids.is_empty() || context.missing_batches.is_empty() { - return Ok((self, context)); - } - - // Build row selection for the missing batches - let backfill_selection = - build_selection_for_batches(&context.missing_batches, cache_batch_size, row_count); - - if !backfill_selection.selects_any() { - return 
Ok((self, context)); - } - - // Clone the reader for this operation (cheap since it's Arc-based) - let reader_clone: ParquetMetadataCacheReader = self.input.clone(); - - // Use official parquet async reader - let options = ArrowReaderOptions::new(); - let reader_metadata = ArrowReaderMetadata::try_new(Arc::clone(&self.metadata), options)?; - - let mut stream = - ParquetRecordBatchStreamBuilder::new_with_metadata(reader_clone, reader_metadata) - .with_projection(context.cache_projection.clone()) - .with_row_groups(vec![context.row_group_idx]) - .with_row_selection(backfill_selection) - .with_batch_size(cache_batch_size) - .build()?; - - let mut processed_batches = 0usize; - - // Get the original column indices in projection order - let column_ids = get_root_column_ids( - self.metadata.file_metadata().schema_descr(), - &context.cache_projection, - ); - - while let Some(batch_result) = stream.next().await { - let record_batch = batch_result?; - if record_batch.num_rows() == 0 { - continue; - } - - let Some(batch_id) = context.missing_batches.get(processed_batches) else { - return Err(ParquetError::General( - "parquet stream produced more batches than expected".to_string(), - )); - }; - - let batch_index = usize::from(**batch_id); - let batch_start = batch_index * cache_batch_size; - let expected_len = ((batch_index + 1) * cache_batch_size) - .min(row_count) - .saturating_sub(batch_start.min(row_count)); - - debug_assert!( - record_batch.num_rows() <= cache_batch_size, - "parquet batch larger than cache batch size" - ); - debug_assert_eq!( - record_batch.num_rows(), - expected_len, - "parquet batch length does not match expected cache slice" - ); - - let batch_id = *batch_id; - insert_batch_into_cache( - &record_batch, - &column_ids, - batch_id, - cache_batch_size, - row_count, - &context.cached_row_group, - ) - .await?; - - processed_batches += 1; - } - - if processed_batches != context.missing_batches.len() { - return Err(ParquetError::General(format!( - "expected {} batches from parquet stream, received {}", - context.missing_batches.len(), - processed_batches - ))); - } - - Ok((self, context)) - } } fn build_projection_schema(file_schema: &SchemaRef, projection_column_ids: &[usize]) -> SchemaRef { @@ -243,166 +140,6 @@ fn build_projection_schema(file_schema: &SchemaRef, projection_column_ids: &[usi Arc::new(Schema::new(fields)) } -fn collect_selection_batches( - selection: &RowSelection, - batch_size: usize, - row_count: usize, -) -> Vec { - let mut batches = Vec::new(); - let mut current_row = 0usize; - let selectors: Vec = selection.clone().into(); - - for selector in selectors { - if selector.skip { - current_row += selector.row_count; - continue; - } - - let start = current_row; - let end = (current_row + selector.row_count).min(row_count); - if start >= end { - current_row = current_row.saturating_add(selector.row_count); - continue; - } - - let start_batch = start / batch_size; - let end_batch = (end - 1) / batch_size; - for batch_idx in start_batch..=end_batch { - let batch_id = BatchID::from_raw(batch_idx as u16); - let is_duplicate = batches.last().is_some_and(|last| last == &batch_id); - if !is_duplicate { - batches.push(batch_id); - } - } - current_row += selector.row_count; - } - - batches -} - -fn compute_missing_batches( - cached_row_group: &CachedRowGroupRef, - column_ids: &[usize], - selection_batches: &[BatchID], -) -> Vec { - if column_ids.is_empty() || selection_batches.is_empty() { - return Vec::new(); - } - - let mut columns = Vec::with_capacity(column_ids.len()); - 
for &column_idx in column_ids { - columns.push(cached_row_group.get_column(column_idx as u64)); - } - - let mut missing = Vec::new(); - - 'batch: for &batch_id in selection_batches { - for column in &columns { - match column { - Some(column) => { - if !column.is_cached(batch_id) { - if missing.last().is_some_and(|last| last == &batch_id) { - continue 'batch; - } - missing.push(batch_id); - continue 'batch; - } - } - None => { - if missing.last().is_some_and(|last| last == &batch_id) { - continue 'batch; - } - missing.push(batch_id); - continue 'batch; - } - } - } - } - - missing -} - -fn build_selection_for_batches( - batches: &[BatchID], - batch_size: usize, - row_count: usize, -) -> RowSelection { - if batches.is_empty() { - return RowSelection::from(Vec::::new()); - } - - let mut selectors = Vec::new(); - let mut current_row = 0usize; - - for batch_id in batches { - let batch_idx = usize::from(**batch_id); - let start = batch_idx * batch_size; - if start >= row_count { - continue; - } - let end = ((batch_idx + 1) * batch_size).min(row_count); - - if start > current_row { - selectors.push(RowSelector::skip(start - current_row)); - } - - selectors.push(RowSelector::select(end - start)); - current_row = end; - } - - RowSelection::from(selectors) -} - -async fn insert_batch_into_cache( - record_batch: &RecordBatch, - column_ids: &[usize], - batch_id: BatchID, - batch_size: usize, - row_count: usize, - cached_row_group: &CachedRowGroupRef, -) -> Result<(), ParquetError> { - if column_ids.is_empty() || record_batch.num_rows() == 0 { - return Ok(()); - } - - debug_assert_eq!(record_batch.num_columns(), column_ids.len()); - - let batch_idx = usize::from(*batch_id); - let start = batch_idx * batch_size; - if start >= row_count { - return Ok(()); - } - let end = ((batch_idx + 1) * batch_size).min(row_count); - let len = end - start; - - debug_assert!( - len <= batch_size, - "cache batch length exceeded configured batch size" - ); - debug_assert_eq!( - record_batch.num_rows(), - len, - "record batch length does not match cache batch window" - ); - - for (col_idx, column_id) in column_ids.iter().enumerate() { - let column = cached_row_group.get_column(*column_id as u64).unwrap(); - let array = Arc::clone(record_batch.column(col_idx)); - - if let Err(err) = column.insert(batch_id, array).await - && !matches!(err, InsertArrowArrayError::AlreadyCached) - { - return Err(ParquetError::General(format!( - "Failed to insert batch {} for column {} into cache: {err:?}", - batch_idx, column_id - ))); - } - debug_assert!(column.is_cached(batch_id)); - } - - Ok(()) -} - /// Context for planning what to read from cache vs parquet struct PlanningContext { row_group_idx: usize, @@ -412,23 +149,48 @@ struct PlanningContext { cache_projection: ProjectionMask, projection_column_ids: Vec, cache_column_ids: Vec, - missing_batches: Vec, +} + +fn build_liquid_cache_reader( + reader_factory: &mut ReaderFactory, + context: PlanningContext, + schema: SchemaRef, +) -> LiquidCacheReader { + let row_count = reader_factory + .metadata + .row_group(context.row_group_idx) + .num_rows() as usize; + let cache_batch_size = context.cached_row_group.batch_size(); + LiquidCacheReader::new(LiquidCacheReaderConfig { + batch_size: context.batch_size, + selection: context.selection, + row_filter: reader_factory.filter.take(), + cached_row_group: context.cached_row_group, + projection_columns: context.projection_column_ids, + schema, + parquet_fallback: ParquetFallbackConfig { + row_group_idx: context.row_group_idx, + metadata: 
@@ -603,7 +365,7 @@ impl Stream for LiquidStream {
             match state {
                 StreamState::ReadFromCache(mut batch_reader) => {
-                    match Pin::new(&mut batch_reader).poll_next(cx) {
+                    match Pin::new(&mut *batch_reader).poll_next(cx) {
                         Poll::Ready(Some(Ok(batch))) => {
                             self.state = StreamState::ReadFromCache(batch_reader);
                             return Poll::Ready(Some(Ok(batch)));
@@ -612,6 +374,7 @@ impl Stream for LiquidStream {
                             panic!("Decoding next batch error: {e:?}");
                         }
                         Poll::Ready(None) => {
+                            let batch_reader = *batch_reader;
                             let filter = batch_reader.into_filter();
                             self.reader.as_mut().unwrap().filter = filter;
                             // state left as Init, continue loop to plan next row group
@@ -643,57 +406,18 @@ impl Stream for LiquidStream {
                     );
                     match maybe_context {
                         Some(context) => {
-                            if !context.missing_batches.is_empty()
-                                && !context.cache_column_ids.is_empty()
-                            {
-                                LocalSpan::add_event(Event::new("LiquidStream::fill_cache"));
-                                let reader = self.reader.take().expect("lost reader");
-                                let fut = reader.fill_cache_from_parquet(context).boxed();
-                                self.state = StreamState::FillCache(fut);
-                            } else {
-                                LocalSpan::add_event(Event::new("LiquidStream::read_from_cache"));
-                                let reader_factory = self.reader.as_mut().unwrap();
-                                let batch_reader = LiquidCacheReader::new(
-                                    context.batch_size,
-                                    context.selection,
-                                    reader_factory.filter.take(),
-                                    context.cached_row_group,
-                                    context.projection_column_ids,
-                                    Arc::clone(&self.schema),
-                                );
-                                self.state = StreamState::ReadFromCache(batch_reader);
-                            }
+                            LocalSpan::add_event(Event::new("LiquidStream::read_from_cache"));
+                            let schema = Arc::clone(&self.schema);
+                            let reader_factory = self.reader.as_mut().unwrap();
+                            let batch_reader =
+                                build_liquid_cache_reader(reader_factory, context, schema);
+                            self.state = StreamState::ReadFromCache(Box::new(batch_reader));
                         }
                         None => {
                             self.state = StreamState::Init;
                         }
                     }
                 }
-                StreamState::FillCache(mut f) => match f.as_mut().poll(cx) {
-                    Poll::Pending => {
-                        self.state = StreamState::FillCache(f);
-                        return Poll::Pending;
-                    }
-                    Poll::Ready(result) => match result {
-                        Ok((reader_factory, context)) => {
-                            self.reader = Some(reader_factory);
-                            LocalSpan::add_event(Event::new("LiquidStream::read_from_cache"));
-                            let reader_factory = self.reader.as_mut().unwrap();
-                            let batch_reader = LiquidCacheReader::new(
-                                context.batch_size,
-                                context.selection,
-                                reader_factory.filter.take(),
-                                context.cached_row_group,
-                                context.projection_column_ids,
-                                Arc::clone(&self.schema),
-                            );
-                            self.state = StreamState::ReadFromCache(batch_reader);
-                        }
-                        Err(e) => {
-                            panic!("Filling cache error: {e:?}");
-                        }
-                    },
-                },
             }
         }
     }
 
@@ -702,31 +426,316 @@
 #[cfg(test)]
 mod tests {
     use super::*;
-    use crate::cache::LiquidCacheParquet;
-    use arrow::array::{ArrayRef, Int32Array};
+    use crate::cache::{BatchID, CachedFileRef, LiquidCacheParquet};
+    use crate::reader::plantime::{
+        CachedMetaReaderFactory, FilterCandidateBuilder, LiquidPredicate,
+    };
+    use arrow::array::{Array, ArrayRef, Int32Array};
     use arrow_schema::{DataType, Field, Schema};
+    use datafusion::common::ScalarValue;
+    use datafusion::datasource::listing::PartitionedFile;
+    use datafusion::logical_expr::Operator;
+    use datafusion::physical_expr::PhysicalExpr;
+    use datafusion::physical_expr::expressions::{BinaryExpr, Column, Literal};
+    use datafusion::physical_plan::metrics::ExecutionPlanMetricsSet;
+    use futures::StreamExt;
     use liquid_cache::cache::AlwaysHydrate;
     use liquid_cache::cache::squeeze_policies::Evict;
     use liquid_cache::cache_policies::LiquidPolicy;
-    use parquet::arrow::arrow_reader::RowSelection;
+    use object_store::local::LocalFileSystem;
+    use parquet::arrow::ArrowWriter;
+    use parquet::arrow::arrow_reader::{ArrowReaderMetadata, ArrowReaderOptions};
+    use std::fs::File;
     use std::sync::Arc;
 
-    async fn make_cache(batch_size: usize, schema: SchemaRef) -> CachedRowGroupRef {
+    fn write_two_row_group_file(path: &std::path::Path, schema: SchemaRef) {
+        let file = File::create(path).unwrap();
+        let mut writer = ArrowWriter::try_new(file, schema.clone(), None).unwrap();
+        let batch0 = RecordBatch::try_new(
+            schema.clone(),
+            vec![
+                Arc::new(Int32Array::from(vec![0, 1, 2, 3])),
+                Arc::new(Int32Array::from(vec![10, 11, 12, 13])),
+            ],
+        )
+        .unwrap();
+        let batch1 = RecordBatch::try_new(
+            schema,
+            vec![
+                Arc::new(Int32Array::from(vec![4, 5, 6, 7])),
+                Arc::new(Int32Array::from(vec![14, 15, 16, 17])),
+            ],
+        )
+        .unwrap();
+        writer.write(&batch0).unwrap();
+        writer.flush().unwrap();
+        writer.write(&batch1).unwrap();
+        writer.close().unwrap();
+    }
+
+    fn write_single_row_group_file(path: &std::path::Path, schema: SchemaRef, a: Vec<i32>) {
+        let file = File::create(path).unwrap();
+        let mut writer = ArrowWriter::try_new(file, schema.clone(), None).unwrap();
+        let b: Vec<_> = a.iter().map(|value| value + 1000).collect();
+        let batch = RecordBatch::try_new(
+            schema,
+            vec![Arc::new(Int32Array::from(a)), Arc::new(Int32Array::from(b))],
+        )
+        .unwrap();
+        writer.write(&batch).unwrap();
+        writer.close().unwrap();
+    }
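These helpers depend on `ArrowWriter::flush` ending the current row group, which is what gives `data.parquet` its two four-row row groups. A standalone check of that behavior using the same crates (the temp path and single column are illustrative):

```rust
use std::{fs::File, sync::Arc};

use arrow::array::Int32Array;
use arrow::record_batch::RecordBatch;
use arrow_schema::{DataType, Field, Schema};
use parquet::arrow::ArrowWriter;
use parquet::file::reader::{FileReader, SerializedFileReader};

fn main() {
    let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)]));
    let path = std::env::temp_dir().join("two_groups.parquet");

    let mut writer =
        ArrowWriter::try_new(File::create(&path).unwrap(), schema.clone(), None).unwrap();
    let batch =
        RecordBatch::try_new(schema, vec![Arc::new(Int32Array::from(vec![0, 1, 2, 3]))]).unwrap();
    writer.write(&batch).unwrap();
    writer.flush().unwrap(); // closes row group 0
    writer.write(&batch).unwrap();
    writer.close().unwrap(); // closes row group 1 and writes the footer

    let reader = SerializedFileReader::new(File::open(&path).unwrap()).unwrap();
    assert_eq!(reader.metadata().num_row_groups(), 2);
}
```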
+
+    async fn make_liquid_stream(
+        max_memory_bytes: usize,
+        max_disk_bytes: usize,
+        row_filter: Option<LiquidRowFilter>,
+    ) -> (
+        LiquidStream,
+        Arc<LiquidCacheParquet>,
+        CachedFileRef,
+        tempfile::TempDir,
+    ) {
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("a", DataType::Int32, false),
+            Field::new("b", DataType::Int32, false),
+        ]));
         let tmp_dir = tempfile::tempdir().unwrap();
+        let parquet_path = tmp_dir.path().join("data.parquet");
+        write_two_row_group_file(&parquet_path, schema.clone());
+        let metadata_file = File::open(&parquet_path).unwrap();
+        let reader_metadata =
+            ArrowReaderMetadata::load(&metadata_file, ArrowReaderOptions::new()).unwrap();
+        let object_store = Arc::new(LocalFileSystem::new_with_prefix(tmp_dir.path()).unwrap());
+        let partitioned_file = PartitionedFile::new(
+            "data.parquet",
+            std::fs::metadata(&parquet_path).unwrap().len(),
+        );
+        let metrics = ExecutionPlanMetricsSet::new();
+        let input = CachedMetaReaderFactory::new(object_store).create_liquid_reader(
+            0,
+            partitioned_file,
+            None,
+            &metrics,
+        );
+
         let store = t4::mount(tmp_dir.path().join("liquid_cache.t4"))
             .await
             .unwrap();
-        let cache = LiquidCacheParquet::new(
-            batch_size,
-            usize::MAX,
-            store,
-            Box::new(LiquidPolicy::new()),
-            Box::new(Evict),
-            Box::new(AlwaysHydrate::new()),
-        )
-        .await;
-        let file = cache.register_or_get_file("test.parquet".to_string(), schema);
-        file.create_row_group(0, vec![])
+        let cache = Arc::new(
+            LiquidCacheParquet::new(
+                4,
+                max_memory_bytes,
+                max_disk_bytes,
+                store,
+                Box::new(LiquidPolicy::new()),
+                Box::new(Evict),
+                Box::new(AlwaysHydrate::new()),
+            )
+            .await,
+        );
+        let cached_file = cache.register_or_get_file("data.parquet".to_string(), schema);
+        let projection = ProjectionMask::roots(
+            reader_metadata.metadata().file_metadata().schema_descr(),
+            [0, 1],
+        );
+        let mut builder = LiquidStreamBuilder::new(input, Arc::clone(reader_metadata.metadata()))
+            .with_batch_size(4)
+            .with_row_groups(vec![0, 1])
+            .with_projection(projection);
+        if let Some(row_filter) = row_filter {
+            builder = builder.with_row_filter(row_filter);
+        }
+        let stream = builder.build(cached_file.clone()).unwrap();
+        (stream, cache, cached_file, tmp_dir)
+    }
+
+    async fn collect_liquid_values(stream: LiquidStream) -> (Vec<i32>, Vec<i32>) {
+        let batches = stream
+            .map(|batch| batch.expect("valid liquid stream batch"))
+            .collect::<Vec<_>>()
+            .await;
+        let mut a = Vec::new();
+        let mut b = Vec::new();
+        for batch in batches {
+            let a_array = batch
+                .column(0)
+                .as_any()
+                .downcast_ref::<Int32Array>()
+                .unwrap();
+            let b_array = batch
+                .column(1)
+                .as_any()
+                .downcast_ref::<Int32Array>()
+                .unwrap();
+            a.extend(a_array.iter().map(|value| value.unwrap()));
+            b.extend(b_array.iter().map(|value| value.unwrap()));
+        }
+        (a, b)
+    }
+
+    async fn collect_projected_a(stream: LiquidStream) -> Vec<i32> {
+        let batches = stream
+            .map(|batch| batch.expect("valid liquid stream batch"))
+            .collect::<Vec<_>>()
+            .await;
+        let mut a = Vec::new();
+        for batch in batches {
+            let a_array = batch
+                .column(0)
+                .as_any()
+                .downcast_ref::<Int32Array>()
+                .unwrap();
+            a.extend(a_array.iter().map(|value| value.unwrap()));
+        }
+        a
+    }
+
+    fn gt_filter(schema: SchemaRef, literal: i32) -> LiquidRowFilter {
+        gt_filter_on(schema, "a", 0, literal)
+    }
+
+    fn gt_filter_on(
+        schema: SchemaRef,
+        col_name: &str,
+        col_idx: usize,
+        literal: i32,
+    ) -> LiquidRowFilter {
+        let expr: Arc<dyn PhysicalExpr> = Arc::new(BinaryExpr::new(
+            Arc::new(Column::new(col_name, col_idx)),
+            Operator::Gt,
+            Arc::new(Literal::new(ScalarValue::Int32(Some(literal)))),
+        ));
+        let tmp_meta = tempfile::NamedTempFile::new().unwrap();
+        write_two_row_group_file(tmp_meta.path(), schema.clone());
+        let file = File::open(tmp_meta.path()).unwrap();
+        let metadata = ArrowReaderMetadata::load(&file, ArrowReaderOptions::new()).unwrap();
+        let builder = FilterCandidateBuilder::new(expr, schema);
+        let candidate = builder.build(metadata.metadata()).unwrap().unwrap();
+        let projection = candidate.projection(metadata.metadata());
+        let predicate = LiquidPredicate::try_new(candidate, projection).unwrap();
+        LiquidRowFilter::new(vec![predicate])
+    }
+
+    async fn make_liquid_stream_with_projection(
+        max_memory_bytes: usize,
+        max_disk_bytes: usize,
+        row_filter: Option<LiquidRowFilter>,
+        projection_columns: Vec<usize>,
+    ) -> (
+        LiquidStream,
+        Arc<LiquidCacheParquet>,
+        CachedFileRef,
+        tempfile::TempDir,
+    ) {
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("a", DataType::Int32, false),
+            Field::new("b", DataType::Int32, false),
+        ]));
+        let tmp_dir = tempfile::tempdir().unwrap();
+        let parquet_path = tmp_dir.path().join("data.parquet");
+        write_two_row_group_file(&parquet_path, schema.clone());
+        let metadata_file = File::open(&parquet_path).unwrap();
+        let reader_metadata =
+            ArrowReaderMetadata::load(&metadata_file, ArrowReaderOptions::new()).unwrap();
+        let object_store = Arc::new(LocalFileSystem::new_with_prefix(tmp_dir.path()).unwrap());
+        let partitioned_file = PartitionedFile::new(
+            "data.parquet",
+            std::fs::metadata(&parquet_path).unwrap().len(),
+        );
+        let metrics = ExecutionPlanMetricsSet::new();
+        let input = CachedMetaReaderFactory::new(object_store).create_liquid_reader(
+            0,
+            partitioned_file,
+            None,
+            &metrics,
+        );
+
+        let store = t4::mount(tmp_dir.path().join("liquid_cache.t4"))
+            .await
+            .unwrap();
+        let cache = Arc::new(
+            LiquidCacheParquet::new(
+                4,
+                max_memory_bytes,
+                max_disk_bytes,
+                store,
+                Box::new(LiquidPolicy::new()),
+                Box::new(Evict),
+                Box::new(AlwaysHydrate::new()),
+            )
+            .await,
+        );
+        let cached_file = cache.register_or_get_file("data.parquet".to_string(), schema);
+        let projection = ProjectionMask::roots(
+            reader_metadata.metadata().file_metadata().schema_descr(),
+            projection_columns,
+        );
+        let mut builder = LiquidStreamBuilder::new(input, Arc::clone(reader_metadata.metadata()))
+            .with_batch_size(4)
+            .with_row_groups(vec![0, 1])
+            .with_projection(projection);
+        if let Some(row_filter) = row_filter {
+            builder = builder.with_row_filter(row_filter);
+        }
+        let stream = builder.build(cached_file.clone()).unwrap();
+        (stream, cache, cached_file, tmp_dir)
+    }
+
+    async fn make_single_row_group_stream(
+        parquet_a: Vec<i32>,
+        projection_columns: Vec<usize>,
+    ) -> (LiquidStream, CachedFileRef, tempfile::TempDir) {
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("a", DataType::Int32, false),
+            Field::new("b", DataType::Int32, false),
+        ]));
+        let tmp_dir = tempfile::tempdir().unwrap();
+        let parquet_path = tmp_dir.path().join("data.parquet");
+        write_single_row_group_file(&parquet_path, schema.clone(), parquet_a);
+        let metadata_file = File::open(&parquet_path).unwrap();
+        let reader_metadata =
+            ArrowReaderMetadata::load(&metadata_file, ArrowReaderOptions::new()).unwrap();
+        let object_store = Arc::new(LocalFileSystem::new_with_prefix(tmp_dir.path()).unwrap());
+        let partitioned_file = PartitionedFile::new(
+            "data.parquet",
+            std::fs::metadata(&parquet_path).unwrap().len(),
+        );
+        let metrics = ExecutionPlanMetricsSet::new();
+        let input = CachedMetaReaderFactory::new(object_store).create_liquid_reader(
+            0,
+            partitioned_file,
+            None,
+            &metrics,
+        );
+
+        let store = t4::mount(tmp_dir.path().join("liquid_cache.t4"))
+            .await
+            .unwrap();
+        let cache = Arc::new(
+            LiquidCacheParquet::new(
+                4,
+                usize::MAX,
+                usize::MAX,
+                store,
+                Box::new(LiquidPolicy::new()),
+                Box::new(Evict),
+                Box::new(AlwaysHydrate::new()),
+            )
+            .await,
+        );
+        let cached_file = cache.register_or_get_file("data.parquet".to_string(), schema);
+        let projection = ProjectionMask::roots(
+            reader_metadata.metadata().file_metadata().schema_descr(),
+            projection_columns,
+        );
+        let stream = LiquidStreamBuilder::new(input, Arc::clone(reader_metadata.metadata()))
+            .with_batch_size(4)
+            .with_row_groups(vec![0])
+            .with_projection(projection)
+            .build(cached_file.clone())
+            .unwrap();
+        (stream, cached_file, tmp_dir)
     }
 
     async fn insert_batches(
@@ -744,139 +753,132 @@ mod tests {
         }
     }
 
-    #[test]
-    fn collect_selection_batches_marks_all_selected_batches() {
-        let selection = RowSelection::from(vec![
-            RowSelector::select(3),
-            RowSelector::skip(2),
-            RowSelector::select(5),
-        ]);
-        let batches = collect_selection_batches(&selection, 4, 10);
-        let expected = vec![
-            BatchID::from_raw(0),
-            BatchID::from_raw(1),
-            BatchID::from_raw(2),
-        ];
-        assert_eq!(batches, expected);
+    async fn is_cached(row_group: &CachedRowGroupRef, column_id: usize, batch_idx: u16) -> bool {
+        row_group
+            .get_column(column_id as u64)
+            .unwrap()
+            .get_arrow_array_test_only(BatchID::from_raw(batch_idx))
+            .await
+            .is_some()
+    }
 
-    #[test]
-    fn collect_selection_batches_handles_empty_selection() {
-        let selection = RowSelection::from(vec![]);
-        let batches = collect_selection_batches(&selection, 4, 10);
-        let expected: Vec<BatchID> = vec![];
-        assert_eq!(batches, expected);
-    }
+    #[tokio::test]
+    async fn cache_full_keeps_inserted_batches_and_skips_failed_inserts() {
+        let one_array_memory = Arc::new(Int32Array::from(vec![0, 1, 2, 3])).get_array_memory_size();
+        let (stream, _cache, cached_file, _tmp_dir) =
+            make_liquid_stream(one_array_memory * 3, 0, None).await;
 
-    #[test]
-    fn collect_selection_batches_handles_selection_beyond_row_count() {
-        let selection = RowSelection::from(vec![
-            RowSelector::select(5),  // Select 5 rows
-            RowSelector::skip(2),    // Skip 2 rows
-            RowSelector::select(10), // Select 10 rows (but only 3 rows left)
-        ]);
-        let batches = collect_selection_batches(&selection, 4, 8);
-        // Total rows: 8
-        // First selector: select 5 rows (rows 0-4) -> batches 0, 1
-        // Skip 2 rows (rows 5-6)
-        // Third selector: select 10 rows from row 7, but only 1 row left -> batch 1
-        let expected = vec![BatchID::from_raw(0), BatchID::from_raw(1)];
-        assert_eq!(batches, expected);
+        let (a, b) = collect_liquid_values(stream).await;
+
+        assert_eq!(a, vec![0, 1, 2, 3, 4, 5, 6, 7]);
+        assert_eq!(b, vec![10, 11, 12, 13, 14, 15, 16, 17]);
+
+        let row_group0 = cached_file.create_row_group(0, vec![]);
+        let row_group1 = cached_file.create_row_group(1, vec![]);
+        assert!(is_cached(&row_group0, 0, 0).await);
+        assert!(is_cached(&row_group0, 1, 0).await);
+        assert!(is_cached(&row_group1, 0, 0).await);
+        assert!(!is_cached(&row_group1, 1, 0).await);
     }
 
     #[tokio::test]
-    async fn compute_missing_batches_identifies_partial_columns() {
+    async fn cache_full_with_row_filter_keeps_lookaside_results_correct() {
         let schema = Arc::new(Schema::new(vec![
-            Field::new("col_0", DataType::Int32, false),
-            Field::new("col_1", DataType::Int32, false),
-            Field::new("col_2", DataType::Int32, false),
+            Field::new("a", DataType::Int32, false),
+            Field::new("b", DataType::Int32, false),
         ]));
-        let row_group = make_cache(4, schema.clone()).await;
-        insert_batches(&row_group, 0, &[(0, &[1, 2, 3, 4]), (2, &[9, 9, 9, 9])]).await;
-        insert_batches(&row_group, 2, &[(0, &[5, 6, 7, 8])]).await;
-
-        let selection_batches = vec![
-            BatchID::from_raw(0),
-            BatchID::from_raw(1),
-            BatchID::from_raw(2),
-        ];
+        let one_array_memory = Arc::new(Int32Array::from(vec![0, 1, 2, 3])).get_array_memory_size();
+        let filter = gt_filter(schema, 2);
+        let (stream, _cache, cached_file, _tmp_dir) =
+            make_liquid_stream(one_array_memory * 3, 0, Some(filter)).await;
 
-        let missing_for_col0 = compute_missing_batches(&row_group, &[0], &selection_batches);
-        assert_eq!(missing_for_col0, vec![BatchID::from_raw(1)]);
+        let (a, b) = collect_liquid_values(stream).await;
 
-        let missing_for_col2 = compute_missing_batches(&row_group, &[2], &selection_batches);
-        assert_eq!(
-            missing_for_col2,
-            vec![BatchID::from_raw(1), BatchID::from_raw(2),]
-        );
+        assert_eq!(a, vec![3, 4, 5, 6, 7]);
+        assert_eq!(b, vec![13, 14, 15, 16, 17]);
 
-        let missing_for_col1 = compute_missing_batches(&row_group, &[1], &selection_batches);
-        assert_eq!(
-            missing_for_col1,
-            vec![
-                BatchID::from_raw(0),
-                BatchID::from_raw(1),
-                BatchID::from_raw(2),
-            ]
-        );
+        let row_group0 = cached_file.create_row_group(0, vec![]);
+        let row_group1 = cached_file.create_row_group(1, vec![]);
+        assert!(is_cached(&row_group0, 0, 0).await);
+        assert!(is_cached(&row_group0, 1, 0).await);
+        assert!(is_cached(&row_group1, 0, 0).await);
+        assert!(!is_cached(&row_group1, 1, 0).await);
     }
 
-    #[test]
-    fn build_selection_for_batches_generates_sparse_selectors() {
-        let selection =
-            build_selection_for_batches(&[BatchID::from_raw(1), BatchID::from_raw(3)], 4, 20);
-        let selectors: Vec<RowSelector> = selection.into();
-        assert_eq!(
-            selectors,
-            vec![
-                RowSelector::skip(4),
-                RowSelector::select(4),
-                RowSelector::skip(4),
-                RowSelector::select(4),
-            ]
-        );
-    }
+    #[tokio::test]
+    async fn mid_scan_eviction_recovers() {
+        let (stream, _cache, cached_file, _tmp_dir) = make_liquid_stream(0, 0, None).await;
 
-    #[test]
-    fn build_selection_for_batches_handles_empty_batches() {
-        let selection = build_selection_for_batches(&[], 4, 20);
-        let selectors: Vec<RowSelector> = selection.into();
-        assert_eq!(selectors, vec![]);
+        let (a, b) = collect_liquid_values(stream).await;
+
+        assert_eq!(a, vec![0, 1, 2, 3, 4, 5, 6, 7]);
+        assert_eq!(b, vec![10, 11, 12, 13, 14, 15, 16, 17]);
+
+        let row_group0 = cached_file.create_row_group(0, vec![]);
+        let row_group1 = cached_file.create_row_group(1, vec![]);
+        assert!(!is_cached(&row_group0, 0, 0).await);
+        assert!(!is_cached(&row_group0, 1, 0).await);
+        assert!(!is_cached(&row_group1, 0, 0).await);
+        assert!(!is_cached(&row_group1, 1, 0).await);
     }
 
-    #[test]
-    fn build_selection_for_batches_handles_batch_beyond_row_count() {
-        let selection =
-            build_selection_for_batches(&[BatchID::from_raw(5), BatchID::from_raw(6)], 4, 16);
-        let selectors: Vec<RowSelector> = selection.into();
-        // Total rows: 16, so valid batches are 0-3 (rows 0-15)
-        // Batch 5: start=20, end=min(24,16)=16, but 20 >= 16, so skipped
-        // Batch 6: start=24, end=min(28,16)=16, but 24 >= 16, so skipped
-        // Result should be empty selection
-        assert_eq!(selectors, vec![]);
+    #[tokio::test]
+    async fn predicate_fallback_uses_predicate_projection() {
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("a", DataType::Int32, false),
+            Field::new("b", DataType::Int32, false),
+        ]));
+        let one_array_memory = Arc::new(Int32Array::from(vec![0, 1, 2, 3])).get_array_memory_size();
+        let filter = gt_filter_on(schema, "b", 1, 13);
+        let (stream, _cache, cached_file, _tmp_dir) =
+            make_liquid_stream_with_projection(one_array_memory * 3, 0, Some(filter), vec![0])
+                .await;
+
+        let a_values = collect_projected_a(stream).await;
+
+        assert_eq!(a_values, vec![4, 5, 6, 7]);
+
+        let row_group0 = cached_file.create_row_group(0, vec![]);
+        let row_group1 = cached_file.create_row_group(1, vec![]);
+        assert!(is_cached(&row_group0, 0, 0).await);
+        assert!(is_cached(&row_group0, 1, 0).await);
+        assert!(is_cached(&row_group1, 0, 0).await);
+        assert!(!is_cached(&row_group1, 1, 0).await);
     }
 
-    #[test]
-    fn build_selection_for_batches_handles_single_batch() {
-        let selection = build_selection_for_batches(&[BatchID::from_raw(2)], 4, 20);
-        let selectors: Vec<RowSelector> = selection.into();
-        // Batch 2: rows 8-11
-        // Should skip 8 rows then select 4 rows
-        assert_eq!(
-            selectors,
-            vec![RowSelector::skip(8), RowSelector::select(4),]
-        );
+    #[tokio::test]
+    async fn missing_column_falls_back_to_parquet() {
+        let (stream, _cache, cached_file, _tmp_dir) =
+            make_liquid_stream(usize::MAX, usize::MAX, None).await;
+        let row_group0 = cached_file.create_row_group(0, vec![]);
+        let row_group1 = cached_file.create_row_group(1, vec![]);
+        insert_batches(&row_group0, 0, &[(0, &[0, 1, 2, 3])]).await;
+        insert_batches(&row_group1, 0, &[(0, &[4, 5, 6, 7])]).await;
+
+        let (a, b) = collect_liquid_values(stream).await;
+
+        assert_eq!(a, vec![0, 1, 2, 3, 4, 5, 6, 7]);
+        assert_eq!(b, vec![10, 11, 12, 13, 14, 15, 16, 17]);
+        assert!(is_cached(&row_group0, 1, 0).await);
+        assert!(is_cached(&row_group1, 1, 0).await);
     }
 
-    #[test]
-    fn build_selection_for_batches_handles_partial_last_batch() {
-        let selection = build_selection_for_batches(&[BatchID::from_raw(4)], 4, 18);
-        let selectors: Vec<RowSelector> = selection.into();
-        // Batch 4: start=16, end=min(20,18)=18
-        // Should skip 16 rows then select 2 rows (18-16=2)
-        assert_eq!(
-            selectors,
-            vec![RowSelector::skip(16), RowSelector::select(2),]
-        );
+    #[tokio::test]
+    async fn fallback_stream_advances_across_misses() {
+        let parquet_a = vec![
+            100, 101, 102, 103, 4, 5, 6, 7, 200, 201, 202, 203, 12, 13, 14, 15,
+        ];
+        let (stream, cached_file, _tmp_dir) =
+            make_single_row_group_stream(parquet_a, vec![0]).await;
+        let row_group = cached_file.create_row_group(0, vec![]);
+        insert_batches(&row_group, 0, &[(0, &[0, 1, 2, 3]), (2, &[8, 9, 10, 11])]).await;
+
+        let a_values = collect_projected_a(stream).await;
+
+        assert_eq!(a_values, (0..16).collect::<Vec<_>>());
+        assert!(is_cached(&row_group, 0, 0).await);
+        assert!(is_cached(&row_group, 0, 1).await);
+        assert!(is_cached(&row_group, 0, 2).await);
+        assert!(is_cached(&row_group, 0, 3).await);
     }
 }
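The budget in the cache-full tests comes from `get_array_memory_size`, which reports the buffer memory behind one four-row Int32 column batch; `one_array_memory * 3` therefore admits exactly three of the four column batches the scan produces. A self-contained sketch of that accounting (the greedy admission loop is illustrative, not the cache's actual policy):

```rust
use std::sync::Arc;

use arrow::array::{Array, Int32Array};

fn main() {
    let one = Arc::new(Int32Array::from(vec![0, 1, 2, 3])).get_array_memory_size();
    let budget = one * 3;

    // The two-row-group test file yields four column batches of this
    // shape (two columns x two row groups).
    let mut used = 0usize;
    let mut admitted = 0usize;
    for _ in 0..4 {
        if used + one <= budget {
            used += one;
            admitted += 1;
        }
    }
    assert_eq!(admitted, 3); // the fourth insert exceeds the budget
}
```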