diff --git a/.cargo/config.toml b/.cargo/config.toml deleted file mode 100644 index 91a099a61..000000000 --- a/.cargo/config.toml +++ /dev/null @@ -1,12 +0,0 @@ -[target.x86_64-apple-darwin] -rustflags = [ - "-C", "link-arg=-undefined", - "-C", "link-arg=dynamic_lookup", -] - -[target.aarch64-apple-darwin] -rustflags = [ - "-C", "link-arg=-undefined", - "-C", "link-arg=dynamic_lookup", -] - diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 2dc8b96fe..4c07b08bb 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -15,34 +15,66 @@ # specific language governing permissions and limitations # under the License. -name: Python Release Build +# Reusable workflow for linting and building +# This ensures the same build steps run for both debug (PRs) and release (main/tags) builds + +name: Build + on: - pull_request: - branches: ["main"] - push: - tags: ["*-rc*"] - branches: ["branch-*"] + workflow_call: + inputs: + build_mode: + description: 'Build mode: debug or release' + required: true + type: string + run_wheels: + description: 'Whether to build distribution wheels' + required: false + type: boolean + default: false + +env: + CARGO_TERM_COLOR: always + RUST_BACKTRACE: 1 jobs: - build: + # ============================================ + # Linting Jobs + # ============================================ + lint-rust: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v6 + + - name: Setup Rust + uses: dtolnay/rust-toolchain@stable + with: + toolchain: "nightly" + components: rustfmt + + - name: Cache Cargo + uses: Swatinem/rust-cache@v2 + + - name: Check formatting + run: cargo +nightly fmt --all -- --check + + lint-python: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v5 + - uses: actions/checkout@v6 + - name: Install Python - uses: actions/setup-python@v6 + uses: actions/setup-python@v5 with: python-version: "3.12" - - uses: astral-sh/setup-uv@v7 + - uses: astral-sh/setup-uv@v6 with: - enable-cache: true + 
enable-cache: true - # Use the --no-install-package to only install the dependencies - # but do not yet build the rust library - name: Install dependencies run: uv sync --dev --no-install-package datafusion - # Update output format to enable automatic inline annotations. - name: Run Ruff run: | uv run --no-project ruff check --output-format=github python/ @@ -50,26 +82,181 @@ jobs: - name: Run codespell run: | - uv run --no-project codespell --toml pyproject.toml + uv run --no-project codespell --toml pyproject.toml + + lint-toml: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v6 + + - name: Install taplo + uses: taiki-e/install-action@v2 + with: + tool: taplo-cli + + # if you encounter an error, try running 'taplo format' to fix the formatting automatically. + - name: Check Cargo.toml formatting + run: taplo format --check + + check-crates-patch: + if: inputs.build_mode == 'release' && startsWith(github.ref, 'refs/tags/') + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v6 + + - name: Ensure [patch.crates-io] is empty + run: python3 dev/check_crates_patch.py generate-license: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v5 - - uses: astral-sh/setup-uv@v7 + - uses: actions/checkout@v6 + + - uses: astral-sh/setup-uv@v6 with: - enable-cache: true + enable-cache: true + + - name: Install cargo-license + uses: taiki-e/install-action@v2 + with: + tool: cargo-license - name: Generate license file run: uv run --no-project python ./dev/create_license.py - - uses: actions/upload-artifact@v4 + + - uses: actions/upload-artifact@v6 with: name: python-wheel-license path: LICENSE.txt + # ============================================ + # Build - Linux x86_64 + # ============================================ + build-manylinux-x86_64: + needs: [generate-license, lint-rust, lint-python] + name: ManyLinux x86_64 + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v6 + + - run: rm LICENSE.txt + - name: Download LICENSE.txt + uses: 
actions/download-artifact@v7 + with: + name: python-wheel-license + path: . + + - name: Setup Rust + uses: dtolnay/rust-toolchain@stable + + - name: Cache Cargo + uses: Swatinem/rust-cache@v2 + with: + key: ${{ inputs.build_mode }} + + - uses: astral-sh/setup-uv@v6 + with: + enable-cache: true + + - name: Build (release mode) + uses: PyO3/maturin-action@v1 + if: inputs.build_mode == 'release' + with: + target: x86_64-unknown-linux-gnu + manylinux: "2_28" + args: --release --strip --features protoc,substrait --out dist + rustup-components: rust-std + + - name: Build (debug mode) + uses: PyO3/maturin-action@v1 + if: inputs.build_mode == 'debug' + with: + target: x86_64-unknown-linux-gnu + manylinux: "2_28" + args: --features protoc,substrait --out dist + rustup-components: rust-std + + - name: Build FFI test library + uses: PyO3/maturin-action@v1 + with: + target: x86_64-unknown-linux-gnu + manylinux: "2_28" + working-directory: examples/datafusion-ffi-example + args: --out dist + rustup-components: rust-std + + - name: Archive wheels + uses: actions/upload-artifact@v6 + with: + name: dist-manylinux-x86_64 + path: dist/* + + - name: Archive FFI test wheel + uses: actions/upload-artifact@v6 + with: + name: test-ffi-manylinux-x86_64 + path: examples/datafusion-ffi-example/dist/* + + # ============================================ + # Build - Linux ARM64 + # ============================================ + build-manylinux-aarch64: + needs: [generate-license, lint-rust, lint-python] + name: ManyLinux arm64 + runs-on: ubuntu-24.04-arm + steps: + - uses: actions/checkout@v6 + + - run: rm LICENSE.txt + - name: Download LICENSE.txt + uses: actions/download-artifact@v7 + with: + name: python-wheel-license + path: . 
+ + - name: Setup Rust + uses: dtolnay/rust-toolchain@stable + + - name: Cache Cargo + uses: Swatinem/rust-cache@v2 + with: + key: ${{ inputs.build_mode }} + + - uses: astral-sh/setup-uv@v6 + with: + enable-cache: true + + - name: Build (release mode) + uses: PyO3/maturin-action@v1 + if: inputs.build_mode == 'release' + with: + target: aarch64-unknown-linux-gnu + manylinux: "2_28" + args: --release --strip --features protoc,substrait --out dist + rustup-components: rust-std + + - name: Build (debug mode) + uses: PyO3/maturin-action@v1 + if: inputs.build_mode == 'debug' + with: + target: aarch64-unknown-linux-gnu + manylinux: "2_28" + args: --features protoc,substrait --out dist + rustup-components: rust-std + + - name: Archive wheels + uses: actions/upload-artifact@v6 + if: inputs.build_mode == 'release' + with: + name: dist-manylinux-aarch64 + path: dist/* + + # ============================================ + # Build - macOS arm64 / Windows + # ============================================ build-python-mac-win: - needs: [generate-license] - name: Mac/Win + needs: [generate-license, lint-rust, lint-python] + name: macOS arm64 & Windows runs-on: ${{ matrix.os }} strategy: fail-fast: false @@ -77,35 +264,49 @@ jobs: python-version: ["3.10"] os: [macos-latest, windows-latest] steps: - - uses: actions/checkout@v5 - - - uses: actions/setup-python@v6 - with: - python-version: ${{ matrix.python-version }} + - uses: actions/checkout@v6 - uses: dtolnay/rust-toolchain@stable - run: rm LICENSE.txt - name: Download LICENSE.txt - uses: actions/download-artifact@v5 + uses: actions/download-artifact@v7 with: name: python-wheel-license path: . 
+ - name: Cache Cargo + uses: Swatinem/rust-cache@v2 + with: + key: ${{ inputs.build_mode }} + + - uses: astral-sh/setup-uv@v7 + with: + enable-cache: true + - name: Install Protoc uses: arduino/setup-protoc@v3 with: version: "27.4" repo-token: ${{ secrets.GITHUB_TOKEN }} - - uses: astral-sh/setup-uv@v7 - with: - enable-cache: true + - name: Install dependencies + run: uv sync --dev --no-install-package datafusion - - name: Build Python package - run: | - uv sync --dev --no-install-package datafusion - uv run --no-project maturin build --release --strip --features substrait + # Run clippy BEFORE maturin so we can avoid rebuilding. The features must match + # exactly the features used by maturin. Linux maturin builds need to happen in a + # container so only run this for our mac runner. + - name: Run Clippy + if: matrix.os != 'windows-latest' + run: cargo clippy --no-deps --all-targets --features substrait -- -D warnings + + - name: Build Python package (release mode) + if: inputs.build_mode == 'release' + run: uv run --no-project maturin build --release --strip --features substrait + + - name: Build Python package (debug mode) + if: inputs.build_mode != 'release' + run: uv run --no-project maturin build --features substrait - name: List Windows wheels if: matrix.os == 'windows-latest' @@ -119,127 +320,80 @@ jobs: run: find target/wheels/ - name: Archive wheels - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@v6 + if: inputs.build_mode == 'release' with: name: dist-${{ matrix.os }} path: target/wheels/* + # ============================================ + # Build - macOS x86_64 (release only) + # ============================================ build-macos-x86_64: - needs: [generate-license] - name: Mac x86_64 + if: inputs.build_mode == 'release' + needs: [generate-license, lint-rust, lint-python] runs-on: macos-15-intel strategy: fail-fast: false matrix: python-version: ["3.10"] steps: - - uses: actions/checkout@v5 - - - uses: actions/setup-python@v6 - 
with: - python-version: ${{ matrix.python-version }} + - uses: actions/checkout@v6 - uses: dtolnay/rust-toolchain@stable - run: rm LICENSE.txt - name: Download LICENSE.txt - uses: actions/download-artifact@v5 + uses: actions/download-artifact@v7 with: name: python-wheel-license path: . + - name: Cache Cargo + uses: Swatinem/rust-cache@v2 + with: + key: ${{ inputs.build_mode }} + + - uses: astral-sh/setup-uv@v7 + with: + enable-cache: true + - name: Install Protoc uses: arduino/setup-protoc@v3 with: version: "27.4" repo-token: ${{ secrets.GITHUB_TOKEN }} - - uses: astral-sh/setup-uv@v7 - with: - enable-cache: true + - name: Install dependencies + run: uv sync --dev --no-install-package datafusion - - name: Build Python package + - name: Build (release mode) run: | - uv sync --dev --no-install-package datafusion uv run --no-project maturin build --release --strip --features substrait - name: List Mac wheels run: find target/wheels/ - name: Archive wheels - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@v6 with: name: dist-macos-aarch64 path: target/wheels/* - build-manylinux-x86_64: - needs: [generate-license] - name: Manylinux x86_64 - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v5 - - run: rm LICENSE.txt - - name: Download LICENSE.txt - uses: actions/download-artifact@v5 - with: - name: python-wheel-license - path: . 
- - run: cat LICENSE.txt - - name: Build wheels - uses: PyO3/maturin-action@v1 - env: - RUST_BACKTRACE: 1 - with: - rust-toolchain: nightly - target: x86_64 - manylinux: auto - rustup-components: rust-std rustfmt # Keep them in one line due to https://github.com/PyO3/maturin-action/issues/153 - args: --release --manylinux 2014 --features protoc,substrait - - name: Archive wheels - uses: actions/upload-artifact@v4 - with: - name: dist-manylinux-x86_64 - path: target/wheels/* - - build-manylinux-aarch64: - needs: [generate-license] - name: Manylinux arm64 - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v5 - - run: rm LICENSE.txt - - name: Download LICENSE.txt - uses: actions/download-artifact@v5 - with: - name: python-wheel-license - path: . - - run: cat LICENSE.txt - - name: Build wheels - uses: PyO3/maturin-action@v1 - env: - RUST_BACKTRACE: 1 - with: - rust-toolchain: nightly - target: aarch64 - # Use manylinux_2_28-cross because the manylinux2014-cross has GCC 4.8.5, which causes the build to fail - manylinux: 2_28 - rustup-components: rust-std rustfmt # Keep them in one line due to https://github.com/PyO3/maturin-action/issues/153 - args: --release --features protoc,substrait - - name: Archive wheels - uses: actions/upload-artifact@v4 - with: - name: dist-manylinux-aarch64 - path: target/wheels/* + # ============================================ + # Build - Source Distribution + # ============================================ build-sdist: needs: [generate-license] name: Source distribution + if: inputs.build_mode == 'release' runs-on: ubuntu-latest steps: - - uses: actions/checkout@v5 + - uses: actions/checkout@v6 - run: rm LICENSE.txt - name: Download LICENSE.txt - uses: actions/download-artifact@v5 + uses: actions/download-artifact@v7 with: name: python-wheel-license path: . 
@@ -253,16 +407,22 @@ jobs: args: --release --sdist --out dist --features protoc,substrait - name: Assert sdist build does not generate wheels run: | - if [ "$(ls -A target/wheels)" ]; then - echo "Error: Sdist build generated wheels" - exit 1 - else - echo "Directory is clean" - fi + if [ "$(ls -A target/wheels)" ]; then + echo "Error: Sdist build generated wheels" + exit 1 + else + echo "Directory is clean" + fi shell: bash - + + # ============================================ + # Build - Merge Artifacts + # ============================================ + merge-build-artifacts: runs-on: ubuntu-latest + name: Merge build artifacts + if: inputs.build_mode == 'release' needs: - build-python-mac-win - build-macos-x86_64 @@ -271,11 +431,14 @@ jobs: - build-sdist steps: - name: Merge Build Artifacts - uses: actions/upload-artifact/merge@v4 + uses: actions/upload-artifact/merge@v6 with: name: dist pattern: dist-* + # ============================================ + # Build - Documentation + # ============================================ # Documentation build job that runs after wheels are built build-docs: name: Build docs @@ -299,11 +462,11 @@ jobs: fi - name: Checkout docs sources - uses: actions/checkout@v5 + uses: actions/checkout@v6 - name: Checkout docs target branch if: github.event_name == 'push' && (github.ref == 'refs/heads/main' || github.ref_type == 'tag') - uses: actions/checkout@v5 + uses: actions/checkout@v6 with: fetch-depth: 0 ref: ${{ steps.target-branch.outputs.value }} @@ -312,7 +475,7 @@ jobs: - name: Setup Python uses: actions/setup-python@v6 with: - python-version: "3.11" + python-version: "3.10" - name: Install dependencies uses: astral-sh/setup-uv@v7 @@ -321,25 +484,26 @@ jobs: # Download the Linux wheel built in the previous job - name: Download pre-built Linux wheel - uses: actions/download-artifact@v5 + uses: actions/download-artifact@v7 with: name: dist-manylinux-x86_64 path: wheels/ - # Install from the pre-built wheel - - name: Install 
from pre-built wheel + # Install from the pre-built wheels + - name: Install from pre-built wheels run: | set -x uv venv # Install documentation dependencies uv sync --dev --no-install-package datafusion --group docs - # Install the pre-built wheel - WHEEL=$(find wheels/ -name "*.whl" | head -1) - if [ -n "$WHEEL" ]; then - echo "Installing wheel: $WHEEL" - uv pip install "$WHEEL" + # Install all pre-built wheels + WHEELS=$(find wheels/ -name "*.whl") + if [ -n "$WHEELS" ]; then + echo "Installing wheels:" + echo "$WHEELS" + uv pip install wheels/*.whl else - echo "ERROR: No wheel found!" + echo "ERROR: No wheels found!" exit 1 fi @@ -368,16 +532,3 @@ jobs: git commit -m 'Publish built docs triggered by ${{ github.sha }}' git push || git push --force fi - - # NOTE: PyPI publish needs to be done manually for now after release passed the vote - # release: - # name: Publish in PyPI - # needs: [build-manylinux, build-python-mac-win] - # runs-on: ubuntu-latest - # steps: - # - uses: actions/download-artifact@v5 - # - name: Publish to PyPI - # uses: pypa/gh-action-pypi-publish@master - # with: - # user: __token__ - # password: ${{ secrets.pypi_password }} diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 000000000..ab284b522 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,41 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# CI workflow for pull requests - runs tests in DEBUG mode for faster feedback + +name: CI + +on: + pull_request: + branches: ["main"] + +concurrency: + group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }} + cancel-in-progress: true + +jobs: + build: + uses: ./.github/workflows/build.yml + with: + build_mode: debug + run_wheels: false + secrets: inherit + + test: + needs: build + uses: ./.github/workflows/test.yml + secrets: inherit diff --git a/.github/workflows/dev.yml b/.github/workflows/dev.yml index ac45e9fdf..2c8ecbc5e 100644 --- a/.github/workflows/dev.yml +++ b/.github/workflows/dev.yml @@ -25,10 +25,10 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout - uses: actions/checkout@v5 + uses: actions/checkout@v6 - name: Setup Python uses: actions/setup-python@v6 with: - python-version: "3.10" + python-version: "3.14" - name: Audit licenses run: ./dev/release/run-rat.sh . diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml new file mode 100644 index 000000000..bddc89eac --- /dev/null +++ b/.github/workflows/release.yml @@ -0,0 +1,49 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# Release workflow - runs tests in RELEASE mode and builds distribution wheels +# Triggered on: +# - Merges to main +# - Release candidate tags (*-rc*) +# - Release tags (e.g., 45.0.0) + +name: Release Build + +on: + push: + branches: + - "main" + tags: + - "*-rc*" # Release candidates (e.g., 45.0.0-rc1) + - "[0-9]+.*" # Release tags (e.g., 45.0.0) + +concurrency: + group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }} + cancel-in-progress: true + +jobs: + build: + uses: ./.github/workflows/build.yml + with: + build_mode: release + run_wheels: true + secrets: inherit + + test: + needs: build + uses: ./.github/workflows/test.yml + secrets: inherit diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yml similarity index 61% rename from .github/workflows/test.yaml rename to .github/workflows/test.yml index 4bc898683..a2f304aa5 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yml @@ -15,16 +15,13 @@ # specific language governing permissions and limitations # under the License. 
-name: Python test -on: - push: - branches: [main] - pull_request: - branches: [main] +# Reusable workflow for running tests +# This ensures the same tests run for both debug (PRs) and release (main/tags) builds + +name: Test -concurrency: - group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }} - cancel-in-progress: true +on: + workflow_call: jobs: test-matrix: @@ -33,28 +30,16 @@ jobs: fail-fast: false matrix: python-version: - - "3.9" - "3.10" - "3.11" - "3.12" - "3.13" + - "3.14" toolchain: - "stable" steps: - - uses: actions/checkout@v5 - - - name: Setup Rust Toolchain - uses: dtolnay/rust-toolchain@stable - id: rust-toolchain - with: - components: clippy,rustfmt - - - name: Install Protoc - uses: arduino/setup-protoc@v3 - with: - version: '27.4' - repo-token: ${{ secrets.GITHUB_TOKEN }} + - uses: actions/checkout@v6 - name: Setup Python uses: actions/setup-python@v6 @@ -62,42 +47,63 @@ jobs: python-version: ${{ matrix.python-version }} - name: Cache Cargo - uses: actions/cache@v4 + uses: actions/cache@v5 with: path: ~/.cargo - key: cargo-cache-${{ steps.rust-toolchain.outputs.cachekey }}-${{ hashFiles('Cargo.lock') }} + key: cargo-cache-${{ matrix.toolchain }}-${{ hashFiles('Cargo.lock') }} - - name: Check Formatting - if: ${{ matrix.python-version == '3.10' && matrix.toolchain == 'stable' }} - run: cargo fmt -- --check + - name: Install dependencies + uses: astral-sh/setup-uv@v7 + with: + enable-cache: true - - name: Run Clippy - if: ${{ matrix.python-version == '3.10' && matrix.toolchain == 'stable' }} - run: cargo clippy --all-targets --all-features -- -D clippy::all -D warnings -A clippy::redundant_closure + # Download the Linux wheel built in the build workflow + - name: Download pre-built Linux wheel + uses: actions/download-artifact@v7 + with: + name: dist-manylinux-x86_64 + path: wheels/ - - name: Install dependencies and build - uses: astral-sh/setup-uv@v7 + # Download the FFI test wheel + - name: Download 
pre-built FFI test wheel + uses: actions/download-artifact@v7 with: - enable-cache: true + name: test-ffi-manylinux-x86_64 + path: wheels/ + + # Install from the pre-built wheels + - name: Install from pre-built wheels + run: | + set -x + uv venv + # Install development dependencies + uv sync --dev --no-install-package datafusion + # Install all pre-built wheels + WHEELS=$(find wheels/ -name "*.whl") + if [ -n "$WHEELS" ]; then + echo "Installing wheels:" + echo "$WHEELS" + uv pip install wheels/*.whl + else + echo "ERROR: No wheels found!" + exit 1 + fi - name: Run tests env: RUST_BACKTRACE: 1 run: | git submodule update --init - uv sync --dev --no-install-package datafusion - uv run --no-project maturin develop --uv - uv run --no-project pytest -v . + uv run --no-project pytest -v --import-mode=importlib - name: FFI unit tests run: | cd examples/datafusion-ffi-example - uv run --no-project maturin develop --uv uv run --no-project pytest python/tests/_test*.py - name: Cache the generated dataset id: cache-tpch-dataset - uses: actions/cache@v4 + uses: actions/cache@v5 with: path: benchmarks/tpch/data key: tpch-data-2.18.0 diff --git a/.github/workflows/verify-release-candidate.yml b/.github/workflows/verify-release-candidate.yml new file mode 100644 index 000000000..a10a4faa9 --- /dev/null +++ b/.github/workflows/verify-release-candidate.yml @@ -0,0 +1,78 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +name: Verify Release Candidate + +# NOTE: This workflow is intended to be run manually via workflow_dispatch. + +on: + workflow_dispatch: + inputs: + version: + description: Version number (e.g., 52.0.0) + required: true + type: string + rc_number: + description: Release candidate number (e.g., 0) + required: true + type: string + +concurrency: + group: ${{ github.repository }}-${{ github.ref }}-${{ github.workflow }} + cancel-in-progress: true + +jobs: + verify: + name: Verify RC (${{ matrix.os }}-${{ matrix.arch }}) + strategy: + fail-fast: false + matrix: + include: + # Linux + - os: linux + arch: x64 + runner: ubuntu-latest + - os: linux + arch: arm64 + runner: ubuntu-24.04-arm + + # macOS + - os: macos + arch: arm64 + runner: macos-latest + - os: macos + arch: x64 + runner: macos-15-intel + + # Windows + - os: windows + arch: x64 + runner: windows-latest + runs-on: ${{ matrix.runner }} + steps: + - name: Checkout repository + uses: actions/checkout@v6 + + - name: Set up protoc + uses: arduino/setup-protoc@v3 + with: + version: "27.4" + repo-token: ${{ secrets.GITHUB_TOKEN }} + + - name: Run release candidate verification + shell: bash + run: ./dev/release/verify-release-candidate.sh "${{ inputs.version }}" "${{ inputs.rc_number }}" diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index e8c451262..8ae6a4e32 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -22,7 +22,7 @@ repos: - id: actionlint-docker - repo: https://github.com/astral-sh/ruff-pre-commit # Ruff version. 
- rev: v0.9.10 + rev: v0.15.1 hooks: # Run the linter. - id: ruff @@ -33,7 +33,7 @@ repos: - id: rust-fmt name: Rust fmt description: Run cargo fmt on files included in the commit. rustfmt should be installed before-hand. - entry: cargo fmt --all -- + entry: cargo +nightly fmt --all -- pass_filenames: true types: [file, rust] language: system diff --git a/Cargo.lock b/Cargo.lock index 558b044aa..e44c84b97 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1,6 +1,6 @@ # This file is automatically @generated by Cargo. # It is not intended for manual editing. -version = 3 +version = 4 [[package]] name = "abi_stable" @@ -50,15 +50,6 @@ dependencies = [ "core_extensions", ] -[[package]] -name = "addr2line" -version = "0.24.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dfbe277e56a376000877090da837660b4427aad530e3028d44e0bffe4f89a1c1" -dependencies = [ - "gimli", -] - [[package]] name = "adler2" version = "2.0.1" @@ -73,7 +64,7 @@ checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75" dependencies = [ "cfg-if", "const-random", - "getrandom 0.3.3", + "getrandom 0.3.4", "once_cell", "version_check", "zerocopy", @@ -81,9 +72,9 @@ dependencies = [ [[package]] name = "aho-corasick" -version = "1.1.3" +version = "1.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916" +checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301" dependencies = [ "memchr", ] @@ -120,21 +111,22 @@ dependencies = [ [[package]] name = "anyhow" -version = "1.0.99" +version = "1.0.101" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b0674a1ddeecb70197781e945de4b3b8ffb61fa939a5597bcf48503737663100" +checksum = "5f0e0fee31ef5ed1ba1316088939cea399010ed7731dba877ed44aeb407a75ea" [[package]] name = "apache-avro" -version = "0.20.0" +version = "0.21.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "3a033b4ced7c585199fb78ef50fca7fe2f444369ec48080c5fd072efa1a03cc7" +checksum = "36fa98bc79671c7981272d91a8753a928ff6a1cd8e4f20a44c45bd5d313840bf" dependencies = [ "bigdecimal", "bon", - "bzip2 0.6.0", + "bzip2", "crc32fast", "digest", + "liblzma", "log", "miniz_oxide", "num-bigint", @@ -145,19 +137,30 @@ dependencies = [ "serde_bytes", "serde_json", "snap", - "strum 0.27.2", - "strum_macros 0.27.2", + "strum", + "strum_macros", "thiserror", "uuid", - "xz2", "zstd", ] +[[package]] +name = "ar_archive_writer" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7eb93bbb63b9c227414f6eb3a0adfddca591a8ce1e9b60661bb08969b87e340b" +dependencies = [ + "object", +] + [[package]] name = "arc-swap" -version = "1.7.1" +version = "1.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "69f7f8c3906b62b754cd5326047894316021dcfe5a194c8ea52bdd94934a3457" +checksum = "f9f3647c145568cec02c42054e07bdf9a5a698e15b466fb2341bfc393cd24aa5" +dependencies = [ + "rustversion", +] [[package]] name = "arrayref" @@ -173,9 +176,9 @@ checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" [[package]] name = "arrow" -version = "56.2.0" +version = "58.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6e833808ff2d94ed40d9379848a950d995043c7fb3e81a30b383f4c6033821cc" +checksum = "602268ce9f569f282cedb9a9f6bac569b680af47b9b077d515900c03c5d190da" dependencies = [ "arrow-arith", "arrow-array", @@ -195,23 +198,23 @@ dependencies = [ [[package]] name = "arrow-arith" -version = "56.2.0" +version = "58.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ad08897b81588f60ba983e3ca39bda2b179bdd84dced378e7df81a5313802ef8" +checksum = "cd53c6bf277dea91f136ae8e3a5d7041b44b5e489e244e637d00ae302051f56f" dependencies = [ "arrow-array", "arrow-buffer", "arrow-data", "arrow-schema", "chrono", - 
"num", + "num-traits", ] [[package]] name = "arrow-array" -version = "56.2.0" +version = "58.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8548ca7c070d8db9ce7aa43f37393e4bfcf3f2d3681df278490772fd1673d08d" +checksum = "e53796e07a6525edaf7dc28b540d477a934aff14af97967ad1d5550878969b9e" dependencies = [ "ahash", "arrow-buffer", @@ -220,47 +223,51 @@ dependencies = [ "chrono", "chrono-tz", "half", - "hashbrown 0.16.0", - "num", + "hashbrown 0.16.1", + "num-complex", + "num-integer", + "num-traits", ] [[package]] name = "arrow-buffer" -version = "56.2.0" +version = "58.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e003216336f70446457e280807a73899dd822feaf02087d31febca1363e2fccc" +checksum = "f2c1a85bb2e94ee10b76531d8bc3ce9b7b4c0d508cabfb17d477f63f2617bd20" dependencies = [ "bytes", "half", - "num", + "num-bigint", + "num-traits", ] [[package]] name = "arrow-cast" -version = "56.2.0" +version = "58.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "919418a0681298d3a77d1a315f625916cb5678ad0d74b9c60108eb15fd083023" +checksum = "89fb245db6b0e234ed8e15b644edb8664673fefe630575e94e62cd9d489a8a26" dependencies = [ "arrow-array", "arrow-buffer", "arrow-data", + "arrow-ord", "arrow-schema", "arrow-select", "atoi", - "base64 0.22.1", + "base64", "chrono", "comfy-table", "half", "lexical-core", - "num", + "num-traits", "ryu", ] [[package]] name = "arrow-csv" -version = "56.2.0" +version = "58.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bfa9bf02705b5cf762b6f764c65f04ae9082c7cfc4e96e0c33548ee3f67012eb" +checksum = "d374882fb465a194462527c0c15a93aa19a554cf690a6b77a26b2a02539937a7" dependencies = [ "arrow-array", "arrow-cast", @@ -273,21 +280,22 @@ dependencies = [ [[package]] name = "arrow-data" -version = "56.2.0" +version = "58.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"a5c64fff1d142f833d78897a772f2e5b55b36cb3e6320376f0961ab0db7bd6d0" +checksum = "189d210bc4244c715fa3ed9e6e22864673cccb73d5da28c2723fb2e527329b33" dependencies = [ "arrow-buffer", "arrow-schema", "half", - "num", + "num-integer", + "num-traits", ] [[package]] name = "arrow-ipc" -version = "56.2.0" +version = "58.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d3594dcddccc7f20fd069bc8e9828ce37220372680ff638c5e00dea427d88f5" +checksum = "7968c2e5210c41f4909b2ef76f6e05e172b99021c2def5edf3cc48fdd39d1d6c" dependencies = [ "arrow-array", "arrow-buffer", @@ -301,9 +309,9 @@ dependencies = [ [[package]] name = "arrow-json" -version = "56.2.0" +version = "58.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "88cf36502b64a127dc659e3b305f1d993a544eab0d48cce704424e62074dc04b" +checksum = "92111dba5bf900f443488e01f00d8c4ddc2f47f5c50039d18120287b580baa22" dependencies = [ "arrow-array", "arrow-buffer", @@ -313,19 +321,21 @@ dependencies = [ "chrono", "half", "indexmap", + "itoa", "lexical-core", "memchr", - "num", - "serde", + "num-traits", + "ryu", + "serde_core", "serde_json", "simdutf8", ] [[package]] name = "arrow-ord" -version = "56.2.0" +version = "58.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c8f82583eb4f8d84d4ee55fd1cb306720cddead7596edce95b50ee418edf66f" +checksum = "211136cb253577ee1a6665f741a13136d4e563f64f5093ffd6fb837af90b9495" dependencies = [ "arrow-array", "arrow-buffer", @@ -336,9 +346,9 @@ dependencies = [ [[package]] name = "arrow-pyarrow" -version = "56.2.0" +version = "58.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7d924b32e96f8bb74d94cd82bd97b313c432fcb0ea331689ef9e7c6b8be4b258" +checksum = "205437da4c0877c756c81bfe847a621d0a740cd00a155109d65510a1a62ebcd9" dependencies = [ "arrow-array", "arrow-data", @@ -348,9 +358,9 @@ dependencies = [ [[package]] name = "arrow-row" -version = "56.2.0" +version = "58.0.0" 
source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9d07ba24522229d9085031df6b94605e0f4b26e099fb7cdeec37abd941a73753" +checksum = "8e0f20145f9f5ea3fe383e2ba7a7487bf19be36aa9dbf5dd6a1f92f657179663" dependencies = [ "arrow-array", "arrow-buffer", @@ -361,34 +371,34 @@ dependencies = [ [[package]] name = "arrow-schema" -version = "56.2.0" +version = "58.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b3aa9e59c611ebc291c28582077ef25c97f1975383f1479b12f3b9ffee2ffabe" +checksum = "1b47e0ca91cc438d2c7879fe95e0bca5329fff28649e30a88c6f760b1faeddcb" dependencies = [ "bitflags", - "serde", + "serde_core", "serde_json", ] [[package]] name = "arrow-select" -version = "56.2.0" +version = "58.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c41dbbd1e97bfcaee4fcb30e29105fb2c75e4d82ae4de70b792a5d3f66b2e7a" +checksum = "750a7d1dda177735f5e82a314485b6915c7cccdbb278262ac44090f4aba4a325" dependencies = [ "ahash", "arrow-array", "arrow-buffer", "arrow-data", "arrow-schema", - "num", + "num-traits", ] [[package]] name = "arrow-string" -version = "56.2.0" +version = "58.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "53f5183c150fbc619eede22b861ea7c0eebed8eaac0333eaa7f6da5205fd504d" +checksum = "e1eab1208bc4fe55d768cdc9b9f3d9df5a794cdb3ee2586bf89f9b30dc31ad8c" dependencies = [ "arrow-array", "arrow-buffer", @@ -396,7 +406,7 @@ dependencies = [ "arrow-schema", "arrow-select", "memchr", - "num", + "num-traits", "regex", "regex-syntax", ] @@ -415,19 +425,14 @@ dependencies = [ [[package]] name = "async-compression" -version = "0.4.19" +version = "0.4.40" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "06575e6a9673580f52661c92107baabffbf41e2141373441cbcdc47cb733003c" +checksum = "7d67d43201f4d20c78bcda740c142ca52482d81da80681533d33bf3f0596c8e2" dependencies = [ - "bzip2 0.5.2", - "flate2", - "futures-core", - "memchr", + 
"compression-codecs", + "compression-core", "pin-project-lite", "tokio", - "xz2", - "zstd", - "zstd-safe", ] [[package]] @@ -447,7 +452,7 @@ checksum = "3b43422f69d8ff38f95f1b2bb76517c91589a924d1559a0e935d7c8ce0274c11" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.117", ] [[package]] @@ -458,7 +463,7 @@ checksum = "9035ad2d096bed7955a320ee7e2230574d28fd3c3a0f186cbea1ff3c7eed5dbb" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.117", ] [[package]] @@ -482,27 +487,6 @@ version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" -[[package]] -name = "backtrace" -version = "0.3.75" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6806a6321ec58106fea15becdad98371e28d92ccbc7c8f1b3b6dd724fe8f1002" -dependencies = [ - "addr2line", - "cfg-if", - "libc", - "miniz_oxide", - "object", - "rustc-demangle", - "windows-targets 0.52.6", -] - -[[package]] -name = "base64" -version = "0.21.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9d297deb1925b89f2ccc13d7635fa0714f12c87adce1c75356b39ca9b7178567" - [[package]] name = "base64" version = "0.22.1" @@ -511,9 +495,9 @@ checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" [[package]] name = "bigdecimal" -version = "0.4.8" +version = "0.4.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a22f228ab7a1b23027ccc6c350b72868017af7ea8356fbdf19f8d991c690013" +checksum = "4d6867f1565b3aad85681f1015055b087fcfd840d6aeee6eee7f2da317603695" dependencies = [ "autocfg", "libm", @@ -525,9 +509,9 @@ dependencies = [ [[package]] name = "bitflags" -version = "2.9.4" +version = "2.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2261d10cca569e4643e526d8dc2e62e433cc8aba21ab764233731f8d369bf394" +checksum = 
"843867be96c8daad0d758b57df9392b6d8d271134fce549de6ce169ff98a92af" [[package]] name = "blake2" @@ -540,15 +524,16 @@ dependencies = [ [[package]] name = "blake3" -version = "1.8.2" +version = "1.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3888aaa89e4b2a40fca9848e400f6a658a5a3978de7be858e209cafa8be9a4a0" +checksum = "2468ef7d57b3fb7e16b576e8377cdbde2320c60e1491e961d11da40fc4f02a2d" dependencies = [ "arrayref", "arrayvec", "cc", "cfg-if", "constant_time_eq", + "cpufeatures", ] [[package]] @@ -562,9 +547,9 @@ dependencies = [ [[package]] name = "bon" -version = "3.7.2" +version = "3.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c2529c31017402be841eb45892278a6c21a000c0a17643af326c73a73f83f0fb" +checksum = "2d13a61f2963b88eef9c1be03df65d42f6996dfeac1054870d950fcf66686f83" dependencies = [ "bon-macros", "rustversion", @@ -572,9 +557,9 @@ dependencies = [ [[package]] name = "bon-macros" -version = "3.7.2" +version = "3.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d82020dadcb845a345591863adb65d74fa8dc5c18a0b6d408470e13b7adc7005" +checksum = "d314cc62af2b6b0c65780555abb4d02a03dd3b799cd42419044f0c38d99738c0" dependencies = [ "darling", "ident_case", @@ -582,7 +567,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.106", + "syn 2.0.117", ] [[package]] @@ -608,9 +593,9 @@ dependencies = [ [[package]] name = "bumpalo" -version = "3.19.0" +version = "3.20.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "46c5e41b57b8bba42a04676d81cb89e9ee8e859a1a66f80a5a72e1cb76b34d43" +checksum = "5c6f81257d10a0f602a294ae4182251151ff97dbb504ef9afcdda4a64b24d9b4" [[package]] name = "byteorder" @@ -620,43 +605,24 @@ checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" [[package]] name = "bytes" -version = "1.10.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"d71b6127be86fdcfddb610f7182ac57211d4b18a3e9c82eb2d17662f2227ad6a" - -[[package]] -name = "bzip2" -version = "0.5.2" +version = "1.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "49ecfb22d906f800d4fe833b6282cf4dc1c298f5057ca0b5445e5c209735ca47" -dependencies = [ - "bzip2-sys", -] +checksum = "1e748733b7cbc798e1434b6ac524f0c1ff2ab456fe201501e6497c8417a4fc33" [[package]] name = "bzip2" -version = "0.6.0" +version = "0.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bea8dcd42434048e4f7a304411d9273a411f647446c1234a65ce0554923f4cff" +checksum = "f3a53fac24f34a81bc9954b5d6cfce0c21e18ec6959f44f56e8e90e4bb7c346c" dependencies = [ "libbz2-rs-sys", ] -[[package]] -name = "bzip2-sys" -version = "0.1.13+1.0.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "225bff33b2141874fe80d71e07d6eec4f85c5c216453dd96388240f96e1acc14" -dependencies = [ - "cc", - "pkg-config", -] - [[package]] name = "cc" -version = "1.2.37" +version = "1.2.56" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "65193589c6404eb80b450d618eaf9a2cafaaafd57ecce47370519ef674a7bd44" +checksum = "aebf35691d1bfb0ac386a69bac2fde4dd276fb618cf8bf4f5318fe285e821bb2" dependencies = [ "find-msvc-tools", "jobserver", @@ -666,9 +632,9 @@ dependencies = [ [[package]] name = "cfg-if" -version = "1.0.3" +version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2fd1289c04a9ea8cb22300a459a72a385d7c73d3259e2ed7dcb2af674838cfa9" +checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" [[package]] name = "cfg_aliases" @@ -678,14 +644,14 @@ checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724" [[package]] name = "chrono" -version = "0.4.42" +version = "0.4.44" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "145052bdd345b87320e369255277e3fb5152762ad123a901ef5c262dd38fe8d2" +checksum = 
"c673075a2e0e5f4a1dde27ce9dee1ea4558c7ffe648f576438a20ca1d2acc4b0" dependencies = [ "iana-time-zone", "num-traits", "serde", - "windows-link 0.2.0", + "windows-link", ] [[package]] @@ -700,24 +666,44 @@ dependencies = [ [[package]] name = "cmake" -version = "0.1.54" +version = "0.1.57" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e7caa3f9de89ddbe2c607f4101924c5abec803763ae9534e4f4d7d8f84aa81f0" +checksum = "75443c44cd6b379beb8c5b45d85d0773baf31cce901fe7bb252f4eff3008ef7d" dependencies = [ "cc", ] [[package]] name = "comfy-table" -version = "7.1.2" +version = "7.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e0d05af1e006a2407bedef5af410552494ce5be9090444dbbcb57258c1af3d56" +checksum = "958c5d6ecf1f214b4c2bbbbf6ab9523a864bd136dcf71a7e8904799acfe1ad47" dependencies = [ - "strum 0.26.3", - "strum_macros 0.26.4", + "unicode-segmentation", "unicode-width", ] +[[package]] +name = "compression-codecs" +version = "0.4.37" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eb7b51a7d9c967fc26773061ba86150f19c50c0d65c887cb1fbe295fd16619b7" +dependencies = [ + "bzip2", + "compression-core", + "flate2", + "liblzma", + "memchr", + "zstd", + "zstd-safe", +] + +[[package]] +name = "compression-core" +version = "0.4.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "75984efb6ed102a0d42db99afb6c1948f0380d1d91808d5529916e6c08b49d8d" + [[package]] name = "const-random" version = "0.1.18" @@ -733,7 +719,7 @@ version = "0.1.16" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f9d839f2a20b0aee515dc581a6172f2321f96cab76c1a38a4c584a194955390e" dependencies = [ - "getrandom 0.2.16", + "getrandom 0.2.17", "once_cell", "tiny-keccak", ] @@ -749,9 +735,9 @@ dependencies = [ [[package]] name = "constant_time_eq" -version = "0.3.1" +version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"7c74b8349d32d297c9134b8c88677813a227df8f779daa29bfc29c183fe3dca6" +checksum = "3d52eff69cd5e647efe296129160853a42795992097e8af39800e1060caeea9b" [[package]] name = "core-foundation" @@ -825,40 +811,50 @@ checksum = "460fbee9c2c2f33933d720630a6a0bac33ba7053db5344fac858d4b8952d77d5" [[package]] name = "crypto-common" -version = "0.1.6" +version = "0.1.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1bfb12502f3fc46cca1bb51ac28df9d618d813cdc3d2f25b9fe775a34af26bb3" +checksum = "78c8292055d1c1df0cce5d180393dc8cce0abec0a7102adb6c7b1eef6016d60a" dependencies = [ "generic-array", "typenum", ] +[[package]] +name = "cstr" +version = "0.2.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68523903c8ae5aacfa32a0d9ae60cadeb764e1da14ee0d26b1f3089f13a54636" +dependencies = [ + "proc-macro2", + "quote", +] + [[package]] name = "csv" -version = "1.3.1" +version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "acdc4883a9c96732e4733212c01447ebd805833b7275a73ca3ee080fd77afdaf" +checksum = "52cd9d68cf7efc6ddfaaee42e7288d3a99d613d4b50f76ce9827ae0c6e14f938" dependencies = [ "csv-core", "itoa", "ryu", - "serde", + "serde_core", ] [[package]] name = "csv-core" -version = "0.1.12" +version = "0.1.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7d02f3b0da4c6504f86e9cd789d8dbafab48c2321be74e9987593de5a894d93d" +checksum = "704a3c26996a80471189265814dbc2c257598b96b8a7feae2d31ace646bb9782" dependencies = [ "memchr", ] [[package]] name = "darling" -version = "0.21.3" +version = "0.23.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9cdf337090841a411e2a7f3deb9187445851f91b309c0c0a29e05f74a00a48c0" +checksum = "25ae13da2f202d56bd7f91c25fba009e7717a1e4a1cc98a76d844b65ae912e9d" dependencies = [ "darling_core", "darling_macro", @@ -866,27 +862,26 @@ dependencies = [ [[package]] name = "darling_core" -version = "0.21.3" +version = 
"0.23.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1247195ecd7e3c85f83c8d2a366e4210d588e802133e1e355180a9870b517ea4" +checksum = "9865a50f7c335f53564bb694ef660825eb8610e0a53d3e11bf1b0d3df31e03b0" dependencies = [ - "fnv", "ident_case", "proc-macro2", "quote", "strsim", - "syn 2.0.106", + "syn 2.0.117", ] [[package]] name = "darling_macro" -version = "0.21.3" +version = "0.23.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d38308df82d1080de0afee5d069fa14b0326a88c14f15c5ccda35b4a6c414c81" +checksum = "ac3984ec7bd6cfa798e62b4a642426a5be0e68f9401cfc2a01e3fa9ea2fcdb8d" dependencies = [ "darling_core", "quote", - "syn 2.0.106", + "syn 2.0.117", ] [[package]] @@ -905,22 +900,21 @@ dependencies = [ [[package]] name = "datafusion" -version = "50.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fc6759cf9ef57c5c469e4027ac4b4cfa746e06a0f5472c2b922b6a403c2a64c4" +version = "53.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=35749607f585b3bf25b66b7d2289c56c18d03e4f#35749607f585b3bf25b66b7d2289c56c18d03e4f" dependencies = [ "arrow", - "arrow-ipc", "arrow-schema", "async-trait", "bytes", - "bzip2 0.6.0", + "bzip2", "chrono", "datafusion-catalog", "datafusion-catalog-listing", "datafusion-common", "datafusion-common-runtime", "datafusion-datasource", + "datafusion-datasource-arrow", "datafusion-datasource-avro", "datafusion-datasource-csv", "datafusion-datasource-json", @@ -943,7 +937,8 @@ dependencies = [ "datafusion-sql", "flate2", "futures", - "itertools 0.14.0", + "itertools", + "liblzma", "log", "object_store", "parking_lot", @@ -955,15 +950,13 @@ dependencies = [ "tokio", "url", "uuid", - "xz2", "zstd", ] [[package]] name = "datafusion-catalog" -version = "50.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a1c48fc7e6d62590d45f7be7c531980b8ff091d1ab113a9ddf465bef41e4093" +version = "53.0.0" +source = 
"git+https://github.com/apache/datafusion.git?rev=35749607f585b3bf25b66b7d2289c56c18d03e4f#35749607f585b3bf25b66b7d2289c56c18d03e4f" dependencies = [ "arrow", "async-trait", @@ -976,9 +969,8 @@ dependencies = [ "datafusion-physical-expr", "datafusion-physical-plan", "datafusion-session", - "datafusion-sql", "futures", - "itertools 0.14.0", + "itertools", "log", "object_store", "parking_lot", @@ -987,9 +979,8 @@ dependencies = [ [[package]] name = "datafusion-catalog-listing" -version = "50.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3db1266da115de3ab0b2669fc027d96cf0ff777deb3216d52c74b528446ccdd6" +version = "53.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=35749607f585b3bf25b66b7d2289c56c18d03e4f#35749607f585b3bf25b66b7d2289c56c18d03e4f" dependencies = [ "arrow", "async-trait", @@ -999,30 +990,29 @@ dependencies = [ "datafusion-execution", "datafusion-expr", "datafusion-physical-expr", + "datafusion-physical-expr-adapter", "datafusion-physical-expr-common", "datafusion-physical-plan", - "datafusion-session", "futures", + "itertools", "log", "object_store", - "tokio", ] [[package]] name = "datafusion-common" -version = "50.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ad4eb2a48ca10fa1e1a487a28a5bf080e31efac2d4bf12bb7e92c2d9ea4f35e5" +version = "53.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=35749607f585b3bf25b66b7d2289c56c18d03e4f#35749607f585b3bf25b66b7d2289c56c18d03e4f" dependencies = [ "ahash", "apache-avro", "arrow", "arrow-ipc", - "base64 0.22.1", "chrono", "half", - "hashbrown 0.14.5", + "hashbrown 0.16.1", "indexmap", + "itertools", "libc", "log", "object_store", @@ -1036,9 +1026,8 @@ dependencies = [ [[package]] name = "datafusion-common-runtime" -version = "50.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a0422ee64d5791599c46b786063e695f7699fadd3a12ad25038cb3094d05886a" +version = "53.0.0" +source = 
"git+https://github.com/apache/datafusion.git?rev=35749607f585b3bf25b66b7d2289c56c18d03e4f#35749607f585b3bf25b66b7d2289c56c18d03e4f" dependencies = [ "futures", "log", @@ -1047,15 +1036,14 @@ dependencies = [ [[package]] name = "datafusion-datasource" -version = "50.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "904c2e1089b3ccf10786f2dae12bc560fda278e4194a8917c5844d2e8c212818" +version = "53.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=35749607f585b3bf25b66b7d2289c56c18d03e4f#35749607f585b3bf25b66b7d2289c56c18d03e4f" dependencies = [ "arrow", "async-compression", "async-trait", "bytes", - "bzip2 0.6.0", + "bzip2", "chrono", "datafusion-common", "datafusion-common-runtime", @@ -1069,60 +1057,72 @@ dependencies = [ "flate2", "futures", "glob", - "itertools 0.14.0", + "itertools", + "liblzma", "log", "object_store", - "parquet", "rand", - "tempfile", "tokio", "tokio-util", "url", - "xz2", "zstd", ] +[[package]] +name = "datafusion-datasource-arrow" +version = "53.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=35749607f585b3bf25b66b7d2289c56c18d03e4f#35749607f585b3bf25b66b7d2289c56c18d03e4f" +dependencies = [ + "arrow", + "arrow-ipc", + "async-trait", + "bytes", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-datasource", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "datafusion-session", + "futures", + "itertools", + "object_store", + "tokio", +] + [[package]] name = "datafusion-datasource-avro" -version = "50.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a1c87828da05c4115935af3394b27499cd2bd91bcb846a928209650627bf7f93" +version = "53.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=35749607f585b3bf25b66b7d2289c56c18d03e4f#35749607f585b3bf25b66b7d2289c56c18d03e4f" dependencies = [ "apache-avro", "arrow", "async-trait", "bytes", - "chrono", - 
"datafusion-catalog", "datafusion-common", "datafusion-datasource", - "datafusion-execution", - "datafusion-physical-expr", "datafusion-physical-expr-common", "datafusion-physical-plan", "datafusion-session", "futures", "num-traits", "object_store", - "tokio", ] [[package]] name = "datafusion-datasource-csv" -version = "50.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8336a805c42ef4e359daaad142ddc53649f23c7e934c117d8516816afe6b7a3d" +version = "53.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=35749607f585b3bf25b66b7d2289c56c18d03e4f#35749607f585b3bf25b66b7d2289c56c18d03e4f" dependencies = [ "arrow", "async-trait", "bytes", - "datafusion-catalog", "datafusion-common", "datafusion-common-runtime", "datafusion-datasource", "datafusion-execution", "datafusion-expr", - "datafusion-physical-expr", "datafusion-physical-expr-common", "datafusion-physical-plan", "datafusion-session", @@ -1134,20 +1134,17 @@ dependencies = [ [[package]] name = "datafusion-datasource-json" -version = "50.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c691b1565e245ea369bc8418b472a75ea84c2ad2deb61b1521cfa38319a9cd47" +version = "53.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=35749607f585b3bf25b66b7d2289c56c18d03e4f#35749607f585b3bf25b66b7d2289c56c18d03e4f" dependencies = [ "arrow", "async-trait", "bytes", - "datafusion-catalog", "datafusion-common", "datafusion-common-runtime", "datafusion-datasource", "datafusion-execution", "datafusion-expr", - "datafusion-physical-expr", "datafusion-physical-expr-common", "datafusion-physical-plan", "datafusion-session", @@ -1155,58 +1152,56 @@ dependencies = [ "object_store", "serde_json", "tokio", + "tokio-stream", ] [[package]] name = "datafusion-datasource-parquet" -version = "50.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f9f7576ceb5974c5f6874d7f2a5ebfeb58960a920da64017def849e0352fe2d8" +version = 
"53.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=35749607f585b3bf25b66b7d2289c56c18d03e4f#35749607f585b3bf25b66b7d2289c56c18d03e4f" dependencies = [ "arrow", "async-trait", "bytes", - "datafusion-catalog", "datafusion-common", "datafusion-common-runtime", "datafusion-datasource", "datafusion-execution", "datafusion-expr", - "datafusion-functions-aggregate", + "datafusion-functions-aggregate-common", "datafusion-physical-expr", "datafusion-physical-expr-adapter", "datafusion-physical-expr-common", - "datafusion-physical-optimizer", "datafusion-physical-plan", "datafusion-pruning", "datafusion-session", "futures", - "itertools 0.14.0", + "itertools", "log", "object_store", "parking_lot", "parquet", - "rand", "tokio", ] [[package]] name = "datafusion-doc" -version = "50.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9dde7c10244f3657fc01eef8247c0b2b20eae4cf6439a0ebb27322f32026d6b8" +version = "53.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=35749607f585b3bf25b66b7d2289c56c18d03e4f#35749607f585b3bf25b66b7d2289c56c18d03e4f" [[package]] name = "datafusion-execution" -version = "50.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5143fc795cef959b6d5271b2e8f1120382fe929fc4bd027c7d7b993f5352ef7e" +version = "53.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=35749607f585b3bf25b66b7d2289c56c18d03e4f#35749607f585b3bf25b66b7d2289c56c18d03e4f" dependencies = [ "arrow", + "arrow-buffer", "async-trait", + "chrono", "dashmap", "datafusion-common", "datafusion-expr", + "datafusion-physical-expr-common", "futures", "log", "object_store", @@ -1218,9 +1213,8 @@ dependencies = [ [[package]] name = "datafusion-expr" -version = "50.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "63e826296bc5f5d0af3e39c1af473d4091ac6a152a5be2f80c256f0182938428" +version = "53.0.0" +source = 
"git+https://github.com/apache/datafusion.git?rev=35749607f585b3bf25b66b7d2289c56c18d03e4f#35749607f585b3bf25b66b7d2289c56c18d03e4f" dependencies = [ "arrow", "async-trait", @@ -1232,6 +1226,7 @@ dependencies = [ "datafusion-functions-window-common", "datafusion-physical-expr-common", "indexmap", + "itertools", "paste", "recursive", "serde_json", @@ -1240,32 +1235,38 @@ dependencies = [ [[package]] name = "datafusion-expr-common" -version = "50.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9096732d0d8862d1950ca70324fe91f9dee3799eeb0db53ef452bdb573484db6" +version = "53.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=35749607f585b3bf25b66b7d2289c56c18d03e4f#35749607f585b3bf25b66b7d2289c56c18d03e4f" dependencies = [ "arrow", "datafusion-common", "indexmap", - "itertools 0.14.0", + "itertools", "paste", ] [[package]] name = "datafusion-ffi" -version = "50.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a4bef25e2b86d9921f7a98b1a86bfb50cebe2fd97f3a9b96c85ce475e9ef78b0" +version = "53.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=35749607f585b3bf25b66b7d2289c56c18d03e4f#35749607f585b3bf25b66b7d2289c56c18d03e4f" dependencies = [ "abi_stable", "arrow", "arrow-schema", "async-ffi", "async-trait", - "datafusion", + "datafusion-catalog", + "datafusion-common", + "datafusion-datasource", + "datafusion-execution", + "datafusion-expr", "datafusion-functions-aggregate-common", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "datafusion-physical-plan", "datafusion-proto", "datafusion-proto-common", + "datafusion-session", "futures", "log", "prost", @@ -1273,18 +1274,38 @@ dependencies = [ "tokio", ] +[[package]] +name = "datafusion-ffi-example" +version = "52.0.0" +dependencies = [ + "arrow", + "arrow-array", + "arrow-schema", + "async-trait", + "datafusion-catalog", + "datafusion-common", + "datafusion-expr", + "datafusion-ffi", + 
"datafusion-functions-aggregate", + "datafusion-functions-window", + "datafusion-python-util", + "pyo3", + "pyo3-build-config", + "pyo3-log", +] + [[package]] name = "datafusion-functions" -version = "50.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3f362c78ac283e64fd3976e060c1a8a57d5f4dcf844a6b6bd2eb320640a1572e" +version = "53.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=35749607f585b3bf25b66b7d2289c56c18d03e4f#35749607f585b3bf25b66b7d2289c56c18d03e4f" dependencies = [ "arrow", "arrow-buffer", - "base64 0.22.1", + "base64", "blake2", "blake3", "chrono", + "chrono-tz", "datafusion-common", "datafusion-doc", "datafusion-execution", @@ -1292,9 +1313,11 @@ dependencies = [ "datafusion-expr-common", "datafusion-macros", "hex", - "itertools 0.14.0", + "itertools", "log", "md-5", + "memchr", + "num-traits", "rand", "regex", "sha2", @@ -1304,9 +1327,8 @@ dependencies = [ [[package]] name = "datafusion-functions-aggregate" -version = "50.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "22e2a80a80145a796ae3f02eb724ac516178556aec864fe89f6ab3741a4cd249" +version = "53.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=35749607f585b3bf25b66b7d2289c56c18d03e4f#35749607f585b3bf25b66b7d2289c56c18d03e4f" dependencies = [ "ahash", "arrow", @@ -1320,14 +1342,14 @@ dependencies = [ "datafusion-physical-expr-common", "half", "log", + "num-traits", "paste", ] [[package]] name = "datafusion-functions-aggregate-common" -version = "50.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d7dcca2fe7c33409e9ab3f950366aa4cba5db6175a09599fdb658ad9f2cc4296" +version = "53.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=35749607f585b3bf25b66b7d2289c56c18d03e4f#35749607f585b3bf25b66b7d2289c56c18d03e4f" dependencies = [ "ahash", "arrow", @@ -1338,9 +1360,8 @@ dependencies = [ [[package]] name = "datafusion-functions-nested" -version = "50.2.0" 
-source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d1b298733377f3ec8c2868c75b5555b15396d9c13e36c5fda28e80feee34e3ed" +version = "53.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=35749607f585b3bf25b66b7d2289c56c18d03e4f#35749607f585b3bf25b66b7d2289c56c18d03e4f" dependencies = [ "arrow", "arrow-ord", @@ -1348,21 +1369,23 @@ dependencies = [ "datafusion-doc", "datafusion-execution", "datafusion-expr", + "datafusion-expr-common", "datafusion-functions", "datafusion-functions-aggregate", "datafusion-functions-aggregate-common", "datafusion-macros", "datafusion-physical-expr-common", - "itertools 0.14.0", + "hashbrown 0.16.1", + "itertools", + "itoa", "log", "paste", ] [[package]] name = "datafusion-functions-table" -version = "50.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2fa4a380ca362eb0fbd33093e8ca6b7a31057616c7e6ee999b87a4ad3c7c0b3f" +version = "53.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=35749607f585b3bf25b66b7d2289c56c18d03e4f#35749607f585b3bf25b66b7d2289c56c18d03e4f" dependencies = [ "arrow", "async-trait", @@ -1376,9 +1399,8 @@ dependencies = [ [[package]] name = "datafusion-functions-window" -version = "50.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9068fc85b8e187c706427794d79bb7ee91132b6b192cb7b18e650a5f7c5c1340" +version = "53.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=35749607f585b3bf25b66b7d2289c56c18d03e4f#35749607f585b3bf25b66b7d2289c56c18d03e4f" dependencies = [ "arrow", "datafusion-common", @@ -1394,9 +1416,8 @@ dependencies = [ [[package]] name = "datafusion-functions-window-common" -version = "50.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b2f80ec56e177d166269556649be817a382a374642872df4ca48cf9be3d09b3a" +version = "53.0.0" +source = 
"git+https://github.com/apache/datafusion.git?rev=35749607f585b3bf25b66b7d2289c56c18d03e4f#35749607f585b3bf25b66b7d2289c56c18d03e4f" dependencies = [ "datafusion-common", "datafusion-physical-expr-common", @@ -1404,20 +1425,18 @@ dependencies = [ [[package]] name = "datafusion-macros" -version = "50.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c4868fe261ba01e462033eff141e90453b7630722cad6420fddd81ebb786f6e2" +version = "53.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=35749607f585b3bf25b66b7d2289c56c18d03e4f#35749607f585b3bf25b66b7d2289c56c18d03e4f" dependencies = [ - "datafusion-expr", + "datafusion-doc", "quote", - "syn 2.0.106", + "syn 2.0.117", ] [[package]] name = "datafusion-optimizer" -version = "50.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "40ed8c51b5c37c057e5c7d5945ed807f1cecfba003bdb1a4c3036595dda287c7" +version = "53.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=35749607f585b3bf25b66b7d2289c56c18d03e4f#35749607f585b3bf25b66b7d2289c56c18d03e4f" dependencies = [ "arrow", "chrono", @@ -1426,7 +1445,7 @@ dependencies = [ "datafusion-expr-common", "datafusion-physical-expr", "indexmap", - "itertools 0.14.0", + "itertools", "log", "recursive", "regex", @@ -1435,9 +1454,8 @@ dependencies = [ [[package]] name = "datafusion-physical-expr" -version = "50.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f678f5734147446e1adbee63be4b244c8f0e9cbd5c41525004ace3730190d03e" +version = "53.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=35749607f585b3bf25b66b7d2289c56c18d03e4f#35749607f585b3bf25b66b7d2289c56c18d03e4f" dependencies = [ "ahash", "arrow", @@ -1447,20 +1465,20 @@ dependencies = [ "datafusion-functions-aggregate-common", "datafusion-physical-expr-common", "half", - "hashbrown 0.14.5", + "hashbrown 0.16.1", "indexmap", - "itertools 0.14.0", - "log", + "itertools", "parking_lot", "paste", - 
"petgraph 0.8.2", + "petgraph", + "recursive", + "tokio", ] [[package]] name = "datafusion-physical-expr-adapter" -version = "50.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "086877d4eca538e9cd1f28b917db0036efe0ad8b4fb7c702f520510672032c8d" +version = "53.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=35749607f585b3bf25b66b7d2289c56c18d03e4f#35749607f585b3bf25b66b7d2289c56c18d03e4f" dependencies = [ "arrow", "datafusion-common", @@ -1468,28 +1486,29 @@ dependencies = [ "datafusion-functions", "datafusion-physical-expr", "datafusion-physical-expr-common", - "itertools 0.14.0", + "itertools", ] [[package]] name = "datafusion-physical-expr-common" -version = "50.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f5c5d17f6a4f28f9849ee3449bb9b83406a718e4275c218bf37ca247ee123779" +version = "53.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=35749607f585b3bf25b66b7d2289c56c18d03e4f#35749607f585b3bf25b66b7d2289c56c18d03e4f" dependencies = [ "ahash", "arrow", + "chrono", "datafusion-common", "datafusion-expr-common", - "hashbrown 0.14.5", - "itertools 0.14.0", + "hashbrown 0.16.1", + "indexmap", + "itertools", + "parking_lot", ] [[package]] name = "datafusion-physical-optimizer" -version = "50.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ab9fb8b3fba2634d444e0177862797dc1231e0e20bc4db291a15d39c0d4136c3" +version = "53.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=35749607f585b3bf25b66b7d2289c56c18d03e4f#35749607f585b3bf25b66b7d2289c56c18d03e4f" dependencies = [ "arrow", "datafusion-common", @@ -1500,37 +1519,36 @@ dependencies = [ "datafusion-physical-expr-common", "datafusion-physical-plan", "datafusion-pruning", - "itertools 0.14.0", - "log", + "itertools", "recursive", ] [[package]] name = "datafusion-physical-plan" -version = "50.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum 
= "d5086cb2e579270173ff0eb38d60ba2a081f1d422a743fa673f6096920950eb5" +version = "53.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=35749607f585b3bf25b66b7d2289c56c18d03e4f#35749607f585b3bf25b66b7d2289c56c18d03e4f" dependencies = [ "ahash", "arrow", "arrow-ord", "arrow-schema", "async-trait", - "chrono", "datafusion-common", "datafusion-common-runtime", "datafusion-execution", "datafusion-expr", + "datafusion-functions", "datafusion-functions-aggregate-common", "datafusion-functions-window-common", "datafusion-physical-expr", "datafusion-physical-expr-common", "futures", "half", - "hashbrown 0.14.5", + "hashbrown 0.16.1", "indexmap", - "itertools 0.14.0", + "itertools", "log", + "num-traits", "parking_lot", "pin-project-lite", "tokio", @@ -1538,25 +1556,35 @@ dependencies = [ [[package]] name = "datafusion-proto" -version = "50.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87c686bfd29ec5362fe229247ef03a0beb063b50e307bf72d0f1a80b9d90f8b8" +version = "53.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=35749607f585b3bf25b66b7d2289c56c18d03e4f#35749607f585b3bf25b66b7d2289c56c18d03e4f" dependencies = [ "arrow", "chrono", - "datafusion", + "datafusion-catalog", + "datafusion-catalog-listing", "datafusion-common", + "datafusion-datasource", + "datafusion-datasource-arrow", + "datafusion-datasource-csv", + "datafusion-datasource-json", + "datafusion-datasource-parquet", + "datafusion-execution", "datafusion-expr", + "datafusion-functions-table", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "datafusion-physical-plan", "datafusion-proto-common", "object_store", "prost", + "rand", ] [[package]] name = "datafusion-proto-common" -version = "50.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1972d37680d48d4f6167b535e0a23ea9f814a21e1359d0bd5c30d1345b95aef9" +version = "53.0.0" +source = 
"git+https://github.com/apache/datafusion.git?rev=35749607f585b3bf25b66b7d2289c56c18d03e4f#35749607f585b3bf25b66b7d2289c56c18d03e4f" dependencies = [ "arrow", "datafusion-common", @@ -1565,31 +1593,32 @@ dependencies = [ [[package]] name = "datafusion-pruning" -version = "50.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1f84b866d906118c320459f30385048aeedbe36ac06973d3e4fa0cc5d60d722c" +version = "53.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=35749607f585b3bf25b66b7d2289c56c18d03e4f#35749607f585b3bf25b66b7d2289c56c18d03e4f" dependencies = [ "arrow", - "arrow-schema", "datafusion-common", "datafusion-datasource", "datafusion-expr-common", "datafusion-physical-expr", "datafusion-physical-expr-common", "datafusion-physical-plan", - "itertools 0.14.0", + "itertools", "log", ] [[package]] name = "datafusion-python" -version = "50.0.0" +version = "52.0.0" dependencies = [ "arrow", + "arrow-select", "async-trait", + "cstr", "datafusion", "datafusion-ffi", "datafusion-proto", + "datafusion-python-util", "datafusion-substrait", "futures", "log", @@ -1602,45 +1631,48 @@ dependencies = [ "pyo3-async-runtimes", "pyo3-build-config", "pyo3-log", + "serde_json", "tokio", "url", "uuid", ] [[package]] -name = "datafusion-session" -version = "50.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3820062b9dd2846954eeb844ff9fe3662977b7d2d74947647c779fabfa502508" +name = "datafusion-python-util" +version = "52.0.0" dependencies = [ "arrow", + "datafusion", + "datafusion-ffi", + "prost", + "pyo3", + "tokio", +] + +[[package]] +name = "datafusion-session" +version = "53.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=35749607f585b3bf25b66b7d2289c56c18d03e4f#35749607f585b3bf25b66b7d2289c56c18d03e4f" +dependencies = [ "async-trait", - "dashmap", "datafusion-common", - "datafusion-common-runtime", "datafusion-execution", "datafusion-expr", - "datafusion-physical-expr", 
"datafusion-physical-plan", - "datafusion-sql", - "futures", - "itertools 0.14.0", - "log", - "object_store", "parking_lot", - "tokio", ] [[package]] name = "datafusion-sql" -version = "50.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "375232baa851b2e9d09fcbe8906141a0ec6e0e058addc5565e0d3d790bb9d51d" +version = "53.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=35749607f585b3bf25b66b7d2289c56c18d03e4f#35749607f585b3bf25b66b7d2289c56c18d03e4f" dependencies = [ "arrow", "bigdecimal", + "chrono", "datafusion-common", "datafusion-expr", + "datafusion-functions-nested", "indexmap", "log", "recursive", @@ -1650,15 +1682,15 @@ dependencies = [ [[package]] name = "datafusion-substrait" -version = "50.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dd3cff6283a943da81d5c89a3ada9ac5c4aa1230ab9ab2d7a95bc7b035dacdab" +version = "53.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=35749607f585b3bf25b66b7d2289c56c18d03e4f#35749607f585b3bf25b66b7d2289c56c18d03e4f" dependencies = [ "async-recursion", "async-trait", "chrono", "datafusion", - "itertools 0.14.0", + "half", + "itertools", "object_store", "pbjson-types", "prost", @@ -1686,7 +1718,7 @@ checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.117", ] [[package]] @@ -1714,7 +1746,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" dependencies = [ "libc", - "windows-sys 0.61.0", + "windows-sys 0.61.2", ] [[package]] @@ -1725,9 +1757,9 @@ checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" [[package]] name = "find-msvc-tools" -version = "0.1.1" +version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7fd99930f64d146689264c637b5af2f0233a933bef0d8570e2526bf9e083192d" 
+checksum = "5baebc0774151f905a1a2cc41989300b1e6fbb29aff0ceffa1064fdd3088d582" [[package]] name = "fixedbitset" @@ -1737,9 +1769,9 @@ checksum = "1d674e81391d1e1ab681a28d99df07927c6d4aa5b027d7da16ba32d1d21ecd99" [[package]] name = "flatbuffers" -version = "25.2.10" +version = "25.12.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1045398c1bfd89168b5fd3f1fc11f6e70b34f6f66300c87d44d3de849463abf1" +checksum = "35f6839d7b3b98adde531effaf34f0c2badc6f4735d26fe74709d8e513a96ef3" dependencies = [ "bitflags", "rustc_version", @@ -1747,13 +1779,13 @@ dependencies = [ [[package]] name = "flate2" -version = "1.1.2" +version = "1.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4a3d7db9596fecd151c5f638c0ee5d5bd487b6e0ea232e5dc96d5250f6f94b1d" +checksum = "843fba2746e448b37e26a819579957415c8cef339bf08564fe8b7ddbd959573c" dependencies = [ "crc32fast", - "libz-rs-sys", "miniz_oxide", + "zlib-rs", ] [[package]] @@ -1768,6 +1800,12 @@ version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2" +[[package]] +name = "foldhash" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77ce24cb58228fbb8aa041425bb1050850ac19177686ea6e0f41a70416f56fdb" + [[package]] name = "form_urlencoded" version = "1.2.2" @@ -1779,9 +1817,9 @@ dependencies = [ [[package]] name = "futures" -version = "0.3.31" +version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "65bc07b1a8bc7c85c5f2e110c476c7389b4554ba72af57d8445ea63a576b0876" +checksum = "8b147ee9d1f6d097cef9ce628cd2ee62288d963e16fb287bd9286455b241382d" dependencies = [ "futures-channel", "futures-core", @@ -1794,9 +1832,9 @@ dependencies = [ [[package]] name = "futures-channel" -version = "0.3.31" +version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"2dff15bf788c671c1934e366d07e30c1814a8ef514e1af724a602e8a2fbe1b10" +checksum = "07bbe89c50d7a535e539b8c17bc0b49bdb77747034daa8087407d655f3f7cc1d" dependencies = [ "futures-core", "futures-sink", @@ -1804,15 +1842,15 @@ dependencies = [ [[package]] name = "futures-core" -version = "0.3.31" +version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05f29059c0c2090612e8d742178b0580d2dc940c837851ad723096f87af6663e" +checksum = "7e3450815272ef58cec6d564423f6e755e25379b217b0bc688e295ba24df6b1d" [[package]] name = "futures-executor" -version = "0.3.31" +version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e28d1d997f585e54aebc3f97d39e72338912123a67330d723fdbb564d646c9f" +checksum = "baf29c38818342a3b26b5b923639e7b1f4a61fc5e76102d4b1981c6dc7a7579d" dependencies = [ "futures-core", "futures-task", @@ -1821,38 +1859,38 @@ dependencies = [ [[package]] name = "futures-io" -version = "0.3.31" +version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e5c1b78ca4aae1ac06c48a526a655760685149f0d465d21f37abfe57ce075c6" +checksum = "cecba35d7ad927e23624b22ad55235f2239cfa44fd10428eecbeba6d6a717718" [[package]] name = "futures-macro" -version = "0.3.31" +version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650" +checksum = "e835b70203e41293343137df5c0664546da5745f82ec9b84d40be8336958447b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.117", ] [[package]] name = "futures-sink" -version = "0.3.31" +version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e575fab7d1e0dcb8d0c7bcf9a63ee213816ab51902e6d244a95819acacf1d4f7" +checksum = "c39754e157331b013978ec91992bde1ac089843443c49cbc7f46150b0fad0893" [[package]] name = "futures-task" -version = "0.3.31" +version = "0.3.32" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "f90f7dce0722e95104fcb095585910c0977252f286e354b5e3bd38902cd99988" +checksum = "037711b3d59c33004d3856fbdc83b99d4ff37a24768fa1be9ce3538a1cde4393" [[package]] name = "futures-util" -version = "0.3.31" +version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9fa08315bb612088cc391249efdc3bc77536f16c91f6cf495e6fbe85b20a4a81" +checksum = "389ca41296e6190b48053de0321d02a77f32f8a5d2461dd38762c0593805c6d6" dependencies = [ "futures-channel", "futures-core", @@ -1862,7 +1900,6 @@ dependencies = [ "futures-task", "memchr", "pin-project-lite", - "pin-utils", "slab", ] @@ -1887,36 +1924,43 @@ dependencies = [ [[package]] name = "getrandom" -version = "0.2.16" +version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "335ff9f135e4384c8150d6f27c6daed433577f86b4750418338c01a1a2528592" +checksum = "ff2abc00be7fca6ebc474524697ae276ad847ad0a6b3faa4bcb027e9a4614ad0" dependencies = [ "cfg-if", "js-sys", "libc", - "wasi 0.11.1+wasi-snapshot-preview1", + "wasi", "wasm-bindgen", ] [[package]] name = "getrandom" -version = "0.3.3" +version = "0.3.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "26145e563e54f2cadc477553f1ec5ee650b00862f0a58bcd12cbdc5f0ea2d2f4" +checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd" dependencies = [ "cfg-if", "js-sys", "libc", "r-efi", - "wasi 0.14.7+wasi-0.2.4", + "wasip2", "wasm-bindgen", ] [[package]] -name = "gimli" -version = "0.31.1" +name = "getrandom" +version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "07e28edb80900c19c28f1072f2e8aeca7fa06b23cd4169cefe1af5aa3260783f" +checksum = "139ef39800118c7683f2fd3c98c1b23c09ae076556b435f8e9064ae108aaeeec" +dependencies = [ + "cfg-if", + "libc", + "r-efi", + "wasip2", + "wasip3", +] [[package]] name = "glob" @@ -1926,9 +1970,9 @@ checksum = 
"0cc23270f6e1808e30a928bdc84dea0b9b4136a8bc82338574f23baf47bbd280" [[package]] name = "h2" -version = "0.4.12" +version = "0.4.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f3c0b69cfcb4e1b9f1bf2f53f95f766e4661169728ec61cd3fe5a0166f2d1386" +checksum = "2f44da3a8150a6703ed5d34e164b875fd14c2cdab9af1252a9a1020bde2bdc54" dependencies = [ "atomic-waker", "bytes", @@ -1945,13 +1989,14 @@ dependencies = [ [[package]] name = "half" -version = "2.6.0" +version = "2.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "459196ed295495a68f7d7fe1d84f6c4b7ff0e21fe3017b2f283c6fac3ad803c9" +checksum = "6ea2d84b969582b4b1864a92dc5d27cd2b77b622a8d79306834f1be5ba20d84b" dependencies = [ "cfg-if", "crunchy", "num-traits", + "zerocopy", ] [[package]] @@ -1959,10 +2004,6 @@ name = "hashbrown" version = "0.14.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" -dependencies = [ - "ahash", - "allocator-api2", -] [[package]] name = "hashbrown" @@ -1970,16 +2011,19 @@ version = "0.15.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1" dependencies = [ - "allocator-api2", - "equivalent", - "foldhash", + "foldhash 0.1.5", ] [[package]] name = "hashbrown" -version = "0.16.0" +version = "0.16.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5419bdc4f6a9207fbeba6d11b604d481addf78ecd10c11ad51e76c2f6482748d" +checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100" +dependencies = [ + "allocator-api2", + "equivalent", + "foldhash 0.2.0", +] [[package]] name = "heck" @@ -1995,12 +2039,11 @@ checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" [[package]] name = "http" -version = "1.3.1" +version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "f4a85d31aea989eead29a3aaf9e1115a180df8282431156e533de47660892565" +checksum = "e3ba2a386d7f85a81f119ad7498ebe444d2e22c2af0b86b069416ace48b3311a" dependencies = [ "bytes", - "fnv", "itoa", ] @@ -2041,9 +2084,9 @@ checksum = "135b12329e5e3ce057a9f972339ea52bc954fe1e9358ef27f95e89716fbc5424" [[package]] name = "hyper" -version = "1.7.0" +version = "1.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eb3aa54a13a0dfe7fbe3a59e0c76093041720fdc77b110cc0fc260fafb4dc51e" +checksum = "2ab2d4f250c3d7b1c9fcdff1cece94ea4e2dfbec68614f7b87cb205f24ca9d11" dependencies = [ "atomic-waker", "bytes", @@ -2080,14 +2123,13 @@ dependencies = [ [[package]] name = "hyper-util" -version = "0.1.17" +version = "0.1.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c6995591a8f1380fcb4ba966a252a4b29188d51d2b89e3a252f5305be65aea8" +checksum = "96547c2556ec9d12fb1578c4eaf448b04993e7fb79cbaad930a656880a6bdfa0" dependencies = [ - "base64 0.22.1", + "base64", "bytes", "futures-channel", - "futures-core", "futures-util", "http", "http-body", @@ -2104,9 +2146,9 @@ dependencies = [ [[package]] name = "iana-time-zone" -version = "0.1.64" +version = "0.1.65" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "33e57f83510bb73707521ebaffa789ec8caf86f9657cad665b092b581d40e9fb" +checksum = "e31bc9ad994ba00e440a8aa5c9ef0ec67d5cb5e5cb0cc7f8b744a35b389cc470" dependencies = [ "android_system_properties", "core-foundation-sys", @@ -2128,9 +2170,9 @@ dependencies = [ [[package]] name = "icu_collections" -version = "2.0.0" +version = "2.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "200072f5d0e3614556f94a9930d5dc3e0662a652823904c3a75dc3b0af7fee47" +checksum = "4c6b649701667bbe825c3b7e6388cb521c23d88644678e83c0c4d0a621a34b43" dependencies = [ "displaydoc", "potential_utf", @@ -2141,9 +2183,9 @@ dependencies = [ [[package]] name = "icu_locale_core" -version = "2.0.0" +version = 
"2.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0cde2700ccaed3872079a65fb1a78f6c0a36c91570f28755dda67bc8f7d9f00a" +checksum = "edba7861004dd3714265b4db54a3c390e880ab658fec5f7db895fae2046b5bb6" dependencies = [ "displaydoc", "litemap", @@ -2154,11 +2196,10 @@ dependencies = [ [[package]] name = "icu_normalizer" -version = "2.0.0" +version = "2.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "436880e8e18df4d7bbc06d58432329d6458cc84531f7ac5f024e93deadb37979" +checksum = "5f6c8828b67bf8908d82127b2054ea1b4427ff0230ee9141c54251934ab1b599" dependencies = [ - "displaydoc", "icu_collections", "icu_normalizer_data", "icu_properties", @@ -2169,42 +2210,38 @@ dependencies = [ [[package]] name = "icu_normalizer_data" -version = "2.0.0" +version = "2.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "00210d6893afc98edb752b664b8890f0ef174c8adbb8d0be9710fa66fbbf72d3" +checksum = "7aedcccd01fc5fe81e6b489c15b247b8b0690feb23304303a9e560f37efc560a" [[package]] name = "icu_properties" -version = "2.0.1" +version = "2.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "016c619c1eeb94efb86809b015c58f479963de65bdb6253345c1a1276f22e32b" +checksum = "020bfc02fe870ec3a66d93e677ccca0562506e5872c650f893269e08615d74ec" dependencies = [ - "displaydoc", "icu_collections", "icu_locale_core", "icu_properties_data", "icu_provider", - "potential_utf", "zerotrie", "zerovec", ] [[package]] name = "icu_properties_data" -version = "2.0.1" +version = "2.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "298459143998310acd25ffe6810ed544932242d3f07083eee1084d83a71bd632" +checksum = "616c294cf8d725c6afcd8f55abc17c56464ef6211f9ed59cccffe534129c77af" [[package]] name = "icu_provider" -version = "2.0.0" +version = "2.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"03c80da27b5f4187909049ee2d72f276f0d9f99a42c306bd0131ecfe04d8e5af" +checksum = "85962cf0ce02e1e0a629cc34e7ca3e373ce20dda4c4d7294bbd0bf1fdb59e614" dependencies = [ "displaydoc", "icu_locale_core", - "stable_deref_trait", - "tinystr", "writeable", "yoke", "zerofrom", @@ -2212,6 +2249,12 @@ dependencies = [ "zerovec", ] +[[package]] +name = "id-arena" +version = "2.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d3067d79b975e8844ca9eb072e16b31c3c1c36928edf9c6789548c524d0d954" + [[package]] name = "ident_case" version = "1.0.1" @@ -2241,37 +2284,22 @@ dependencies = [ [[package]] name = "indexmap" -version = "2.11.3" +version = "2.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "92119844f513ffa41556430369ab02c295a3578af21cf945caa3e9e0c2481ac3" +checksum = "7714e70437a7dc3ac8eb7e6f8df75fd8eb422675fc7678aff7364301092b1017" dependencies = [ "equivalent", - "hashbrown 0.15.5", + "hashbrown 0.16.1", + "serde", + "serde_core", ] -[[package]] -name = "indoc" -version = "2.0.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f4c7245a08504955605670dbf141fceab975f15ca21570696aebe9d2e71576bd" - [[package]] name = "integer-encoding" version = "3.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8bb03732005da905c88227371639bf1ad885cc712789c011c31c5fb3ab3ccf02" -[[package]] -name = "io-uring" -version = "0.7.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "046fa2d4d00aea763528b4950358d0ead425372445dc8ff86312b3c69ff7727b" -dependencies = [ - "bitflags", - "cfg-if", - "libc", -] - [[package]] name = "ipnet" version = "2.11.0" @@ -2280,23 +2308,14 @@ checksum = "469fb0b9cefa57e3ef31275ee7cacb78f2fdca44e4765491884a2b119d4eb130" [[package]] name = "iri-string" -version = "0.7.8" +version = "0.7.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"dbc5ebe9c3a1a7a5127f920a418f7585e9e758e911d0466ed004f393b0e380b2" +checksum = "c91338f0783edbd6195decb37bae672fd3b165faffb89bf7b9e6942f8b1a731a" dependencies = [ "memchr", "serde", ] -[[package]] -name = "itertools" -version = "0.13.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "413ee7dfc52ee1a4949ceeb7dbc8a33f2d6c088194d9f922fb8318faf1f01186" -dependencies = [ - "either", -] - [[package]] name = "itertools" version = "0.14.0" @@ -2308,9 +2327,9 @@ dependencies = [ [[package]] name = "itoa" -version = "1.0.15" +version = "1.0.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c" +checksum = "92ecc6618181def0457392ccd0ee51198e065e016d1d527a7ac1b6dc7c1f09d2" [[package]] name = "jobserver" @@ -2318,25 +2337,31 @@ version = "0.1.34" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9afb3de4395d6b3e67a780b6de64b51c978ecf11cb9a462c66be7d4ca9039d33" dependencies = [ - "getrandom 0.3.3", + "getrandom 0.3.4", "libc", ] [[package]] name = "js-sys" -version = "0.3.78" +version = "0.3.85" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c0b063578492ceec17683ef2f8c5e89121fbd0b172cbc280635ab7567db2738" +checksum = "8c942ebf8e95485ca0d52d97da7c5a2c387d0e7f0ba4c35e93bfcaee045955b3" dependencies = [ "once_cell", "wasm-bindgen", ] +[[package]] +name = "leb128fmt" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2" + [[package]] name = "lexical-core" -version = "1.0.5" +version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b765c31809609075565a70b4b71402281283aeda7ecaf4818ac14a7b2ade8958" +checksum = "7d8d125a277f807e55a77304455eb7b1cb52f2b18c143b60e766c120bd64a594" dependencies = [ "lexical-parse-float", "lexical-parse-integer", @@ -2347,53 +2372,46 
@@ dependencies = [ [[package]] name = "lexical-parse-float" -version = "1.0.5" +version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "de6f9cb01fb0b08060209a057c048fcbab8717b4c1ecd2eac66ebfe39a65b0f2" +checksum = "52a9f232fbd6f550bc0137dcb5f99ab674071ac2d690ac69704593cb4abbea56" dependencies = [ "lexical-parse-integer", "lexical-util", - "static_assertions", ] [[package]] name = "lexical-parse-integer" -version = "1.0.5" +version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72207aae22fc0a121ba7b6d479e42cbfea549af1479c3f3a4f12c70dd66df12e" +checksum = "9a7a039f8fb9c19c996cd7b2fcce303c1b2874fe1aca544edc85c4a5f8489b34" dependencies = [ "lexical-util", - "static_assertions", ] [[package]] name = "lexical-util" -version = "1.0.6" +version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a82e24bf537fd24c177ffbbdc6ebcc8d54732c35b50a3f28cc3f4e4c949a0b3" -dependencies = [ - "static_assertions", -] +checksum = "2604dd126bb14f13fb5d1bd6a66155079cb9fa655b37f875b3a742c705dbed17" [[package]] name = "lexical-write-float" -version = "1.0.5" +version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c5afc668a27f460fb45a81a757b6bf2f43c2d7e30cb5a2dcd3abf294c78d62bd" +checksum = "50c438c87c013188d415fbabbb1dceb44249ab81664efbd31b14ae55dabb6361" dependencies = [ "lexical-util", "lexical-write-integer", - "static_assertions", ] [[package]] name = "lexical-write-integer" -version = "1.0.5" +version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "629ddff1a914a836fb245616a7888b62903aae58fa771e1d83943035efa0f978" +checksum = "409851a618475d2d5796377cad353802345cba92c867d9fbcde9cf4eac4e14df" dependencies = [ "lexical-util", - "static_assertions", ] [[package]] @@ -2404,9 +2422,9 @@ checksum = "2c4a545a15244c7d945065b5d392b2d2d7f21526fba56ce51467b06ed445e8f7" [[package]] name = "libc" -version 
= "0.2.175" +version = "0.2.182" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a82ae493e598baaea5209805c49bbf2ea7de956d50d7da0da1164f9c6d28543" +checksum = "6800badb6cb2082ffd7b6a67e6125bb39f18782f793520caee8cb8846be06112" [[package]] name = "libloading" @@ -2419,28 +2437,39 @@ dependencies = [ ] [[package]] -name = "libm" -version = "0.2.15" +name = "liblzma" +version = "0.4.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f9fbbcab51052fe104eb5e5d351cf728d30a5be1fe14d9be8a3b097481fb97de" +checksum = "b6033b77c21d1f56deeae8014eb9fbe7bdf1765185a6c508b5ca82eeaed7f899" +dependencies = [ + "liblzma-sys", +] [[package]] -name = "libmimalloc-sys" -version = "0.1.44" +name = "liblzma-sys" +version = "0.4.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "667f4fec20f29dfc6bc7357c582d91796c169ad7e2fce709468aefeb2c099870" +checksum = "9f2db66f3268487b5033077f266da6777d057949b8f93c8ad82e441df25e6186" dependencies = [ "cc", "libc", + "pkg-config", ] [[package]] -name = "libz-rs-sys" -version = "0.5.2" +name = "libm" +version = "0.2.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6d2cec3eae94f9f509c767b45932f1ada8350c4bdb85af2fcab4a3c14807981" + +[[package]] +name = "libmimalloc-sys" +version = "0.1.44" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "840db8cf39d9ec4dd794376f38acc40d0fc65eec2a8f484f7fd375b84602becd" +checksum = "667f4fec20f29dfc6bc7357c582d91796c169ad7e2fce709468aefeb2c099870" dependencies = [ - "zlib-rs", + "cc", + "libc", ] [[package]] @@ -2451,9 +2480,9 @@ checksum = "df1d3c3b53da64cf5760482273a98e575c651a67eec7f77df96b5b642de8f039" [[package]] name = "litemap" -version = "0.8.0" +version = "0.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "241eaef5fd12c88705a01fc1066c48c4b36e0dd4377dcdc7ec3942cea7a69956" +checksum = 
"6373607a59f0be73a39b6fe456b8192fcc3585f602af20751600e974dd455e77" [[package]] name = "lock_api" @@ -2466,9 +2495,9 @@ dependencies = [ [[package]] name = "log" -version = "0.4.28" +version = "0.4.29" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "34080505efa8e45a4b816c349525ebe327ceaa8559756f0356cba97ef3bf7432" +checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897" [[package]] name = "lru-slab" @@ -2478,24 +2507,13 @@ checksum = "112b39cec0b298b6c1999fee3e31427f74f676e4cb9879ed1a121b43661a4154" [[package]] name = "lz4_flex" -version = "0.11.5" +version = "0.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08ab2867e3eeeca90e844d1940eab391c9dc5228783db2ed999acbc0a9ed375a" +checksum = "ab6473172471198271ff72e9379150e9dfd70d8e533e0752a27e515b48dd375e" dependencies = [ "twox-hash", ] -[[package]] -name = "lzma-sys" -version = "0.1.20" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5fda04ab3764e6cde78b9974eec4f779acaba7c4e84b36eca3cf77c581b85d27" -dependencies = [ - "cc", - "libc", - "pkg-config", -] - [[package]] name = "md-5" version = "0.10.6" @@ -2508,18 +2526,9 @@ dependencies = [ [[package]] name = "memchr" -version = "2.7.5" +version = "2.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32a282da65faaf38286cf3be983213fcf1d2e2a58700e808f83f4ea9a4804bc0" - -[[package]] -name = "memoffset" -version = "0.9.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "488016bfae457b036d996092f6cb448677611ce4449e970ceaf42695203f218a" -dependencies = [ - "autocfg", -] +checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79" [[package]] name = "mimalloc" @@ -2537,38 +2546,25 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1fa76a2c86f704bdb222d66965fb3d63269ce38518b83cb0575fca855ebb6316" dependencies = [ "adler2", + "simd-adler32", ] [[package]] 
name = "mio" -version = "1.0.4" +version = "1.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "78bed444cc8a2160f01cbcf811ef18cac863ad68ae8ca62092e8db51d51c761c" +checksum = "a69bcab0ad47271a0234d9422b131806bf3968021e5dc9328caf2d4cd58557fc" dependencies = [ "libc", - "wasi 0.11.1+wasi-snapshot-preview1", - "windows-sys 0.59.0", + "wasi", + "windows-sys 0.61.2", ] [[package]] -name = "multimap" -version = "0.10.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d87ecb2933e8aeadb3e3a02b828fed80a7528047e68b4f424523a0981a3a084" - -[[package]] -name = "num" -version = "0.4.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "35bd024e8b2ff75562e5f34e7f4905839deb4b22955ef5e73d2fea1b9813cb23" -dependencies = [ - "num-bigint", - "num-complex", - "num-integer", - "num-iter", - "num-rational", - "num-traits", -] +name = "multimap" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d87ecb2933e8aeadb3e3a02b828fed80a7528047e68b4f424523a0981a3a084" [[package]] name = "num-bigint" @@ -2599,28 +2595,6 @@ dependencies = [ "num-traits", ] -[[package]] -name = "num-iter" -version = "0.1.45" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1429034a0490724d0075ebb2bc9e875d6503c3cf69e235a8941aa757d83ef5bf" -dependencies = [ - "autocfg", - "num-integer", - "num-traits", -] - -[[package]] -name = "num-rational" -version = "0.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f83d14da390562dca69fc84082e73e548e1ad308d24accdedd2720017cb37824" -dependencies = [ - "num-bigint", - "num-integer", - "num-traits", -] - [[package]] name = "num-traits" version = "0.2.19" @@ -2633,21 +2607,21 @@ dependencies = [ [[package]] name = "object" -version = "0.36.7" +version = "0.37.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"62948e14d923ea95ea2c7c86c71013138b66525b86bdc08d2dcc262bdb497b87" +checksum = "ff76201f031d8863c38aa7f905eca4f53abbfa15f609db4277d44cd8938f33fe" dependencies = [ "memchr", ] [[package]] name = "object_store" -version = "0.12.4" +version = "0.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c1be0c6c22ec0817cdc77d3842f721a17fd30ab6965001415b5402a74e6b740" +checksum = "c2858065e55c148d294a9f3aae3b0fa9458edadb41a108397094566f4e3c0dfb" dependencies = [ "async-trait", - "base64 0.22.1", + "base64", "bytes", "chrono", "form_urlencoded", @@ -2657,7 +2631,7 @@ dependencies = [ "httparse", "humantime", "hyper", - "itertools 0.14.0", + "itertools", "md-5", "parking_lot", "percent-encoding", @@ -2665,7 +2639,7 @@ dependencies = [ "rand", "reqwest", "ring", - "rustls-pemfile", + "rustls-pki-types", "serde", "serde_json", "serde_urlencoded", @@ -2686,9 +2660,9 @@ checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" [[package]] name = "openssl-probe" -version = "0.1.6" +version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d05e27ee213611ffe7d6348b942e8f942b37114c00cc03cec254295a4a17852e" +checksum = "7c87def4c32ab89d880effc9e097653c8da5d6ef28e6b539d313baaacfbafcbe" [[package]] name = "ordered-float" @@ -2719,37 +2693,36 @@ dependencies = [ "libc", "redox_syscall", "smallvec", - "windows-link 0.2.0", + "windows-link", ] [[package]] name = "parquet" -version = "56.1.0" +version = "58.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89b56b41d1bd36aae415e42f91cae70ee75cf6cba74416b14dce3e958d5990ec" +checksum = "3f491d0ef1b510194426ee67ddc18a9b747ef3c42050c19322a2cd2e1666c29b" dependencies = [ "ahash", "arrow-array", "arrow-buffer", - "arrow-cast", "arrow-data", "arrow-ipc", "arrow-schema", "arrow-select", - "base64 0.22.1", + "base64", "brotli", "bytes", "chrono", "flate2", "futures", "half", - "hashbrown 0.15.5", + "hashbrown 0.16.1", "lz4_flex", 
- "num", "num-bigint", + "num-integer", + "num-traits", "object_store", "paste", - "ring", "seq-macro", "simdutf8", "snap", @@ -2767,31 +2740,31 @@ checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" [[package]] name = "pbjson" -version = "0.7.0" +version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c7e6349fa080353f4a597daffd05cb81572a9c031a6d4fff7e504947496fcc68" +checksum = "898bac3fa00d0ba57a4e8289837e965baa2dee8c3749f3b11d45a64b4223d9c3" dependencies = [ - "base64 0.21.7", + "base64", "serde", ] [[package]] name = "pbjson-build" -version = "0.7.0" +version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6eea3058763d6e656105d1403cb04e0a41b7bbac6362d413e7c33be0c32279c9" +checksum = "af22d08a625a2213a78dbb0ffa253318c5c79ce3133d32d296655a7bdfb02095" dependencies = [ "heck", - "itertools 0.13.0", + "itertools", "prost", "prost-types", ] [[package]] name = "pbjson-types" -version = "0.7.0" +version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e54e5e7bfb1652f95bc361d76f3c780d8e526b134b85417e774166ee941f0887" +checksum = "8e748e28374f10a330ee3bb9f29b828c0ac79831a32bab65015ad9b661ead526" dependencies = [ "bytes", "chrono", @@ -2810,19 +2783,9 @@ checksum = "9b4f627cb1b25917193a259e49bdad08f671f8d9708acfd5fe0a8c1455d87220" [[package]] name = "petgraph" -version = "0.7.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3672b37090dbd86368a4145bc067582552b29c27377cad4e0a306c97f9bd7772" -dependencies = [ - "fixedbitset", - "indexmap", -] - -[[package]] -name = "petgraph" -version = "0.8.2" +version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "54acf3a685220b533e437e264e4d932cfbdc4cc7ec0cd232ed73c08d03b8a7ca" +checksum = "8701b58ea97060d5e5b155d383a69952a60943f0e6dfe30b04c287beb0b27455" dependencies = [ "fixedbitset", "hashbrown 0.15.5", @@ -2868,15 
+2831,15 @@ checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c" [[package]] name = "portable-atomic" -version = "1.11.1" +version = "1.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f84267b20a16ea918e43c6a88433c2d54fa145c92a811b5b047ccbe153674483" +checksum = "c33a9471896f1c69cecef8d20cbe2f7accd12527ce60845ff44c153bb2a21b49" [[package]] name = "potential_utf" -version = "0.1.3" +version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "84df19adbe5b5a0782edcab45899906947ab039ccf4573713735ee7de1e6b08a" +checksum = "b73949432f5e2a09657003c25bca5e19a0e9c84f8058ca374f49e0ebe605af77" dependencies = [ "zerovec", ] @@ -2897,23 +2860,23 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "479ca8adacdd7ce8f1fb39ce9ecccbfe93a3f1344b3d0d97f20bc0196208f62b" dependencies = [ "proc-macro2", - "syn 2.0.106", + "syn 2.0.117", ] [[package]] name = "proc-macro2" -version = "1.0.101" +version = "1.0.106" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89ae43fd86e4158d6db51ad8e2b80f313af9cc74f5c0e03ccb87de09998732de" +checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934" dependencies = [ "unicode-ident", ] [[package]] name = "prost" -version = "0.13.5" +version = "0.14.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2796faa41db3ec313a31f7624d9286acf277b52de526150b7e69f3debf891ee5" +checksum = "d2ea70524a2f82d518bce41317d0fae74151505651af45faf1ffbd6fd33f0568" dependencies = [ "bytes", "prost-derive", @@ -2921,42 +2884,41 @@ dependencies = [ [[package]] name = "prost-build" -version = "0.13.5" +version = "0.14.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "be769465445e8c1474e9c5dac2018218498557af32d9ed057325ec9a41ae81bf" +checksum = "343d3bd7056eda839b03204e68deff7d1b13aba7af2b2fd16890697274262ee7" dependencies = [ "heck", - "itertools 
0.14.0", + "itertools", "log", "multimap", - "once_cell", - "petgraph 0.7.1", + "petgraph", "prettyplease", "prost", "prost-types", "regex", - "syn 2.0.106", + "syn 2.0.117", "tempfile", ] [[package]] name = "prost-derive" -version = "0.13.5" +version = "0.14.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a56d757972c98b346a9b766e3f02746cde6dd1cd1d1d563472929fdd74bec4d" +checksum = "27c6023962132f4b30eb4c172c91ce92d933da334c59c23cddee82358ddafb0b" dependencies = [ "anyhow", - "itertools 0.14.0", + "itertools", "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.117", ] [[package]] name = "prost-types" -version = "0.13.5" +version = "0.14.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "52c2c1bf36ddb1a1c396b3601a3cec27c2462e45f07c386894ec3ccf5332bd16" +checksum = "8991c4cbdb8bc5b11f0b074ffe286c30e523de90fee5ba8132f1399f23cb3dd7" dependencies = [ "prost", ] @@ -2972,37 +2934,36 @@ dependencies = [ [[package]] name = "psm" -version = "0.1.26" +version = "0.1.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6e944464ec8536cd1beb0bbfd96987eb5e3b72f2ecdafdc5c769a37f1fa2ae1f" +checksum = "3852766467df634d74f0b2d7819bf8dc483a0eb2e3b0f50f756f9cfe8b0d18d8" dependencies = [ + "ar_archive_writer", "cc", ] [[package]] name = "pyo3" -version = "0.25.1" +version = "0.28.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8970a78afe0628a3e3430376fc5fd76b6b45c4d43360ffd6cdd40bdde72b682a" +checksum = "cf85e27e86080aafd5a22eae58a162e133a589551542b3e5cee4beb27e54f8e1" dependencies = [ - "indoc", "libc", - "memoffset", "once_cell", "portable-atomic", "pyo3-build-config", "pyo3-ffi", "pyo3-macros", - "unindent", ] [[package]] name = "pyo3-async-runtimes" -version = "0.25.0" +version = "0.28.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d73cc6b1b7d8b3cef02101d37390dbdfe7e450dfea14921cae80a9534ba59ef2" +checksum = 
"9e7364a95bf00e8377bbf9b0f09d7ff9715a29d8fcf93b47d1a967363b973178" dependencies = [ - "futures", + "futures-channel", + "futures-util", "once_cell", "pin-project-lite", "pyo3", @@ -3011,19 +2972,18 @@ dependencies = [ [[package]] name = "pyo3-build-config" -version = "0.25.1" +version = "0.28.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "458eb0c55e7ece017adeba38f2248ff3ac615e53660d7c71a238d7d2a01c7598" +checksum = "8bf94ee265674bf76c09fa430b0e99c26e319c945d96ca0d5a8215f31bf81cf7" dependencies = [ - "once_cell", "target-lexicon", ] [[package]] name = "pyo3-ffi" -version = "0.25.1" +version = "0.28.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7114fe5457c61b276ab77c5055f206295b812608083644a5c5b2640c3102565c" +checksum = "491aa5fc66d8059dd44a75f4580a2962c1862a1c2945359db36f6c2818b748dc" dependencies = [ "libc", "pyo3-build-config", @@ -3031,9 +2991,9 @@ dependencies = [ [[package]] name = "pyo3-log" -version = "0.12.4" +version = "0.13.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "45192e5e4a4d2505587e27806c7b710c231c40c56f3bfc19535d0bb25df52264" +checksum = "26c2ec80932c5c3b2d4fbc578c9b56b2d4502098587edb8bef5b6bfcad43682e" dependencies = [ "arc-swap", "log", @@ -3042,27 +3002,27 @@ dependencies = [ [[package]] name = "pyo3-macros" -version = "0.25.1" +version = "0.28.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a8725c0a622b374d6cb051d11a0983786448f7785336139c3c94f5aa6bef7e50" +checksum = "f5d671734e9d7a43449f8480f8b38115df67bef8d21f76837fa75ee7aaa5e52e" dependencies = [ "proc-macro2", "pyo3-macros-backend", "quote", - "syn 2.0.106", + "syn 2.0.117", ] [[package]] name = "pyo3-macros-backend" -version = "0.25.1" +version = "0.28.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4109984c22491085343c05b0dbc54ddc405c3cf7b4374fc533f5c3313a572ccc" +checksum = 
"22faaa1ce6c430a1f71658760497291065e6450d7b5dc2bcf254d49f66ee700a" dependencies = [ "heck", "proc-macro2", "pyo3-build-config", "quote", - "syn 2.0.106", + "syn 2.0.117", ] [[package]] @@ -3073,9 +3033,9 @@ checksum = "5a651516ddc9168ebd67b24afd085a718be02f8858fe406591b013d101ce2f40" [[package]] name = "quick-xml" -version = "0.38.3" +version = "0.38.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "42a232e7487fc2ef313d96dde7948e7a3c05101870d8985e4fd8d26aedd27b89" +checksum = "b66c2058c55a409d601666cffe35f04333cf1013010882cec174a7467cd4e21c" dependencies = [ "memchr", "serde", @@ -3108,7 +3068,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f1906b49b0c3bc04b5fe5d86a77925ae6524a19b816ae38ce1e426255f1d8a31" dependencies = [ "bytes", - "getrandom 0.3.3", + "getrandom 0.3.4", "lru-slab", "rand", "ring", @@ -3138,9 +3098,9 @@ dependencies = [ [[package]] name = "quote" -version = "1.0.40" +version = "1.0.44" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1885c039570dc00dcb4ff087a89e185fd56bae234ddc7f056a945bf36467248d" +checksum = "21b2ebcf727b7760c461f091f9f0f539b77b8e87f2fd88131e7f1b433b3cece4" dependencies = [ "proc-macro2", ] @@ -3173,11 +3133,11 @@ dependencies = [ [[package]] name = "rand_core" -version = "0.9.3" +version = "0.9.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "99d9a13982dcf210057a8a78572b2217b667c3beacbf3a0d8b454f6f82837d38" +checksum = "76afc826de14238e6e8c374ddcc1fa19e374fd8dd986b0d2af0d02377261d83c" dependencies = [ - "getrandom 0.3.3", + "getrandom 0.3.4", ] [[package]] @@ -3197,23 +3157,23 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "76009fbe0614077fc1a2ce255e3a1881a2e3a3527097d5dc6d8212c585e7e38b" dependencies = [ "quote", - "syn 2.0.106", + "syn 2.0.117", ] [[package]] name = "redox_syscall" -version = "0.5.17" +version = "0.5.18" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "5407465600fb0548f1442edf71dd20683c6ed326200ace4b1ef0763521bb3b77" +checksum = "ed2bf2547551a7053d6fdfafda3f938979645c44812fbfcda098faae3f1a362d" dependencies = [ "bitflags", ] [[package]] name = "regex" -version = "1.11.2" +version = "1.12.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "23d7fd106d8c02486a8d64e778353d1cffe08ce79ac2e82f540c86d0facf6912" +checksum = "e10754a14b9137dd7b1e3e5b0493cc9171fdd105e0ab477f51b72e7f3ac0e276" dependencies = [ "aho-corasick", "memchr", @@ -3223,9 +3183,9 @@ dependencies = [ [[package]] name = "regex-automata" -version = "0.4.10" +version = "0.4.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6b9458fa0bfeeac22b5ca447c63aaf45f28439a709ccd244698632f9aa6394d6" +checksum = "6e1dd4122fc1595e8162618945476892eefca7b88c52820e74af6262213cae8f" dependencies = [ "aho-corasick", "memchr", @@ -3234,23 +3194,23 @@ dependencies = [ [[package]] name = "regex-lite" -version = "0.1.7" +version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "943f41321c63ef1c92fd763bfe054d2668f7f225a5c29f0105903dc2fc04ba30" +checksum = "cab834c73d247e67f4fae452806d17d3c7501756d98c8808d7c9c7aa7d18f973" [[package]] name = "regex-syntax" -version = "0.8.6" +version = "0.8.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "caf4aa5b0f434c91fe5c7f1ecb6a5ece2130b02ad2a590589dda5146df959001" +checksum = "a96887878f22d7bad8a3b6dc5b7440e0ada9a245242924394987b21cf2210a4c" [[package]] name = "regress" -version = "0.10.4" +version = "0.10.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "145bb27393fe455dd64d6cbc8d059adfa392590a45eadf079c01b11857e7b010" +checksum = "2057b2325e68a893284d1538021ab90279adac1139957ca2a74426c6f118fb48" dependencies = [ - "hashbrown 0.15.5", + "hashbrown 0.16.1", "memchr", ] @@ -3265,11 +3225,11 @@ dependencies = [ 
[[package]] name = "reqwest" -version = "0.12.23" +version = "0.12.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d429f34c8092b2d42c7c93cec323bb4adeb7c67698f70839adec842ec10c7ceb" +checksum = "eddd3ca559203180a307f12d114c268abf583f59b03cb906fd0b3ff8646c1147" dependencies = [ - "base64 0.22.1", + "base64", "bytes", "futures-core", "futures-util", @@ -3313,18 +3273,12 @@ checksum = "a4689e6c2294d81e88dc6261c768b63bc4fcdb852be6d1352498b114f61383b7" dependencies = [ "cc", "cfg-if", - "getrandom 0.2.16", + "getrandom 0.2.17", "libc", "untrusted", "windows-sys 0.52.0", ] -[[package]] -name = "rustc-demangle" -version = "0.1.26" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "56f7d92ca342cea22a06f2121d944b4fd82af56988c270852495420f961d4ace" - [[package]] name = "rustc-hash" version = "2.1.1" @@ -3342,22 +3296,22 @@ dependencies = [ [[package]] name = "rustix" -version = "1.1.2" +version = "1.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cd15f8a2c5551a84d56efdc1cd049089e409ac19a3072d5037a17fd70719ff3e" +checksum = "146c9e247ccc180c1f61615433868c99f3de3ae256a30a43b49f67c2d9171f34" dependencies = [ "bitflags", "errno", "libc", "linux-raw-sys", - "windows-sys 0.61.0", + "windows-sys 0.61.2", ] [[package]] name = "rustls" -version = "0.23.31" +version = "0.23.36" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c0ebcbd2f03de0fc1122ad9bb24b127a5a6cd51d72604a3f3c50ac459762b6cc" +checksum = "c665f33d38cea657d9614f766881e4d510e0eda4239891eea56b4cadcf01801b" dependencies = [ "once_cell", "ring", @@ -3369,9 +3323,9 @@ dependencies = [ [[package]] name = "rustls-native-certs" -version = "0.8.1" +version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7fcff2dd52b58a8d98a70243663a0d234c4e2b79235637849d15913394a247d3" +checksum = "612460d5f7bea540c490b2b6395d8e34a953e52b491accd6c86c8164c5932a63" dependencies = [ 
"openssl-probe", "rustls-pki-types", @@ -3379,20 +3333,11 @@ dependencies = [ "security-framework", ] -[[package]] -name = "rustls-pemfile" -version = "2.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dce314e5fee3f39953d46bb63bb8a46d40c2f8fb7cc5a3b6cab2bde9721d6e50" -dependencies = [ - "rustls-pki-types", -] - [[package]] name = "rustls-pki-types" -version = "1.12.0" +version = "1.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "229a4a4c221013e7e1f1a043678c5cc39fe5171437c88fb47151a21e6f5b5c79" +checksum = "be040f8b0a225e40375822a563fa9524378b9d63112f53e19ffff34df5d33fdd" dependencies = [ "web-time", "zeroize", @@ -3400,9 +3345,9 @@ dependencies = [ [[package]] name = "rustls-webpki" -version = "0.103.6" +version = "0.103.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8572f3c2cb9934231157b45499fc41e1f58c589fdfb81a844ba873265e80f8eb" +checksum = "d7df23109aa6c1567d1c575b9952556388da57401e4ace1d15f79eedad0d8f53" dependencies = [ "ring", "rustls-pki-types", @@ -3417,9 +3362,9 @@ checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" [[package]] name = "ryu" -version = "1.0.20" +version = "1.0.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f" +checksum = "9774ba4a74de5f7b1c1451ed6cd5285a32eddb5cccb8cc655a4e50009e06477f" [[package]] name = "same-file" @@ -3436,7 +3381,7 @@ version = "0.1.28" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "891d81b926048e76efe18581bf793546b4c0eaf8448d72be8de2bbee5fd166e1" dependencies = [ - "windows-sys 0.61.0", + "windows-sys 0.61.2", ] [[package]] @@ -3460,7 +3405,7 @@ dependencies = [ "proc-macro2", "quote", "serde_derive_internals", - "syn 2.0.106", + "syn 2.0.117", ] [[package]] @@ -3471,9 +3416,9 @@ checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" 
[[package]] name = "security-framework" -version = "3.4.0" +version = "3.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "60b369d18893388b345804dc0007963c99b7d665ae71d275812d828c6f089640" +checksum = "d17b898a6d6948c3a8ee4372c17cb384f90d2e6e912ef00895b14fd7ab54ec38" dependencies = [ "bitflags", "core-foundation", @@ -3484,9 +3429,9 @@ dependencies = [ [[package]] name = "security-framework-sys" -version = "2.15.0" +version = "2.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cc1f0cbffaac4852523ce30d8bd3c5cdc873501d96ff467ca09b6767bb8cd5c0" +checksum = "321c8673b092a9a42605034a9879d73cb79101ed5fd117bc9a597b89b4e9e61a" dependencies = [ "core-foundation-sys", "libc", @@ -3510,9 +3455,9 @@ checksum = "1bc711410fbe7399f390ca1c3b60ad0f53f80e95c5eb935e52268a0e2cd49acc" [[package]] name = "serde" -version = "1.0.225" +version = "1.0.228" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fd6c24dee235d0da097043389623fb913daddf92c76e9f5a1db88607a0bcbd1d" +checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e" dependencies = [ "serde_core", "serde_derive", @@ -3530,22 +3475,22 @@ dependencies = [ [[package]] name = "serde_core" -version = "1.0.225" +version = "1.0.228" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "659356f9a0cb1e529b24c01e43ad2bdf520ec4ceaf83047b83ddcc2251f96383" +checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.225" +version = "1.0.228" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ea936adf78b1f766949a4977b91d2f5595825bd6ec079aa9543ad2685fc4516" +checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.117", ] [[package]] @@ -3556,20 +3501,20 @@ checksum = 
"18d26a20a969b9e3fdf2fc2d9f21eda6c40e2de84c9408bb5d3b05d499aae711" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.117", ] [[package]] name = "serde_json" -version = "1.0.145" +version = "1.0.149" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "402a6f66d8c709116cf22f558eab210f5a50187f702eb4d7e5ef38d9a7f1c79c" +checksum = "83fc039473c5595ace860d8c4fafa220ff474b3fc6bfdb4293327f1a37e94d86" dependencies = [ "itoa", "memchr", - "ryu", "serde", "serde_core", + "zmij", ] [[package]] @@ -3581,7 +3526,7 @@ dependencies = [ "proc-macro2", "quote", "serde", - "syn 2.0.106", + "syn 2.0.117", ] [[package]] @@ -3626,6 +3571,12 @@ version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" +[[package]] +name = "simd-adler32" +version = "0.3.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e320a6c5ad31d271ad523dcf3ad13e2767ad8b1cb8f047f75a8aeaf8da139da2" + [[package]] name = "simdutf8" version = "0.1.5" @@ -3634,15 +3585,15 @@ checksum = "e3a9fe34e3e7a50316060351f37187a3f546bce95496156754b601a5fa71b76e" [[package]] name = "siphasher" -version = "1.0.1" +version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "56199f7ddabf13fe5074ce809e7d3f42b42ae711800501b5b16ea82ad029c39d" +checksum = "b2aa850e253778c88a04c3d7323b043aeda9d3e30d5971937c1855769763678e" [[package]] name = "slab" -version = "0.4.11" +version = "0.4.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a2ae44ef20feb57a68b23d846850f861394c2e02dc425a50098ae8c90267589" +checksum = "0c790de23124f9ab44544d7ac05d60440adc586479ce501c1d6d7da3cd8c9cf5" [[package]] name = "smallvec" @@ -3658,19 +3609,19 @@ checksum = "1b6b67fb9a61334225b5b790716f609cd58395f895b3fe8b328786812a40bc3b" [[package]] name = "socket2" -version = "0.6.0" +version = "0.6.2" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "233504af464074f9d066d7b5416c5f9b894a5862a6506e306f7b816cdd6f1807" +checksum = "86f4aa3ad99f2088c990dfa82d367e19cb29268ed67c574d10d0a4bfe71f07e0" dependencies = [ "libc", - "windows-sys 0.59.0", + "windows-sys 0.60.2", ] [[package]] name = "sqlparser" -version = "0.58.0" +version = "0.61.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec4b661c54b1e4b603b37873a18c59920e4c51ea8ea2cf527d925424dbd4437c" +checksum = "dbf5ea8d4d7c808e1af1cbabebca9a2abe603bcefc22294c5b95018d53200cb7" dependencies = [ "log", "recursive", @@ -3679,26 +3630,26 @@ dependencies = [ [[package]] name = "sqlparser_derive" -version = "0.3.0" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da5fc6819faabb412da764b99d3b713bb55083c11e7e0c00144d386cd6a1939c" +checksum = "a6dd45d8fc1c79299bfbb7190e42ccbbdf6a5f52e4a6ad98d92357ea965bd289" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.117", ] [[package]] name = "stable_deref_trait" -version = "1.2.0" +version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" +checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596" [[package]] name = "stacker" -version = "0.1.21" +version = "0.1.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cddb07e32ddb770749da91081d8d0ac3a16f1a569a18b20348cd371f5dead06b" +checksum = "08d74a23609d509411d10e2176dc2a4346e3b4aea2e7b1869f19fdedbc71c013" dependencies = [ "cc", "cfg-if", @@ -3707,43 +3658,18 @@ dependencies = [ "windows-sys 0.59.0", ] -[[package]] -name = "static_assertions" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" - [[package]] name = "strsim" version = "0.11.1" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" -[[package]] -name = "strum" -version = "0.26.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8fec0f0aef304996cf250b31b5a10dee7980c85da9d759361292b8bca5a18f06" - [[package]] name = "strum" version = "0.27.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "af23d6f6c1a224baef9d3f61e287d2761385a5b88fdab4eb4c6f11aeb54c4bcf" -[[package]] -name = "strum_macros" -version = "0.26.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c6bee85a5a24955dc440386795aa378cd9cf82acd5f764469152d2270e581be" -dependencies = [ - "heck", - "proc-macro2", - "quote", - "rustversion", - "syn 2.0.106", -] - [[package]] name = "strum_macros" version = "0.27.2" @@ -3753,14 +3679,14 @@ dependencies = [ "heck", "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.117", ] [[package]] name = "substrait" -version = "0.58.0" +version = "0.62.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "de6d24c270c6c672a86c183c3a8439ba46c1936f93cf7296aa692de3b0ff0228" +checksum = "62fc4b483a129b9772ccb9c3f7945a472112fdd9140da87f8a4e7f1d44e045d0" dependencies = [ "heck", "pbjson", @@ -3777,7 +3703,7 @@ dependencies = [ "serde", "serde_json", "serde_yaml", - "syn 2.0.106", + "syn 2.0.117", "typify", "walkdir", ] @@ -3801,9 +3727,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.106" +version = "2.0.117" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ede7c438028d4436d71104916910f5bb611972c5cfd7f89b8300a8186e6fada6" +checksum = "e665b8803e7b1d2a727f4023456bbbbe74da67099c585258af0ad9c5013b9b99" dependencies = [ "proc-macro2", "quote", @@ -3827,46 +3753,46 @@ checksum = "728a70f3dbaf5bab7f0c4b1ac8d7ae5ea60a4b5549c8a5914361c99147a709d2" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.117", ] [[package]] 
name = "target-lexicon" -version = "0.13.3" +version = "0.13.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "df7f62577c25e07834649fc3b39fafdc597c0a3527dc1c60129201ccfcbaa50c" +checksum = "adb6935a6f5c20170eeceb1a3835a49e12e19d792f6dd344ccc76a985ca5a6ca" [[package]] name = "tempfile" -version = "3.22.0" +version = "3.25.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "84fa4d11fadde498443cca10fd3ac23c951f0dc59e080e9f4b93d4df4e4eea53" +checksum = "0136791f7c95b1f6dd99f9cc786b91bb81c3800b639b3478e561ddb7be95e5f1" dependencies = [ "fastrand", - "getrandom 0.3.3", + "getrandom 0.4.1", "once_cell", "rustix", - "windows-sys 0.61.0", + "windows-sys 0.61.2", ] [[package]] name = "thiserror" -version = "2.0.16" +version = "2.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3467d614147380f2e4e374161426ff399c91084acd2363eaf549172b3d5e60c0" +checksum = "4288b5bcbc7920c07a1149a35cf9590a2aa808e0bc1eafaade0b80947865fbc4" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "2.0.16" +version = "2.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c5e1be1c48b9172ee610da68fd9cd2770e7a4056cb3fc98710ee6906f0c7960" +checksum = "ebc4ee7f67670e9b64d05fa4253e753e016c6c95ff35b89b7941d6b856dec1d5" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.117", ] [[package]] @@ -3891,9 +3817,9 @@ dependencies = [ [[package]] name = "tinystr" -version = "0.8.1" +version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5d4f6d1145dcb577acf783d4e601bc1d76a13337bb54e6233add580b07344c8b" +checksum = "42d3e9c45c09de15d06dd8acf5f4e0e399e85927b7f00711024eb7ae10fa4869" dependencies = [ "displaydoc", "zerovec", @@ -3916,48 +3842,57 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.47.1" +version = "1.49.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "89e49afdadebb872d3145a5638b59eb0691ea23e46ca484037cfab3b76b95038" +checksum = "72a2903cd7736441aac9df9d7688bd0ce48edccaadf181c3b90be801e81d3d86" dependencies = [ - "backtrace", "bytes", - "io-uring", "libc", "mio", "pin-project-lite", - "slab", "socket2", "tokio-macros", - "windows-sys 0.59.0", + "windows-sys 0.61.2", ] [[package]] name = "tokio-macros" -version = "2.5.0" +version = "2.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6e06d43f1345a3bcd39f6a56dbb7dcab2ba47e68e8ac134855e7e2bdbaf8cab8" +checksum = "af407857209536a95c8e56f8231ef2c2e2aff839b22e07a1ffcbc617e9db9fa5" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.117", ] [[package]] name = "tokio-rustls" -version = "0.26.2" +version = "0.26.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e727b36a1a0e8b74c376ac2211e40c2c8af09fb4013c60d910495810f008e9b" +checksum = "1729aa945f29d91ba541258c8df89027d5792d85a8841fb65e8bf0f4ede4ef61" dependencies = [ "rustls", "tokio", ] +[[package]] +name = "tokio-stream" +version = "0.1.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32da49809aab5c3bc678af03902d4ccddea2a87d028d86392a4b1560c6906c70" +dependencies = [ + "futures-core", + "pin-project-lite", + "tokio", + "tokio-util", +] + [[package]] name = "tokio-util" -version = "0.7.16" +version = "0.7.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "14307c986784f72ef81c89db7d9e28d6ac26d16213b109ea501696195e6e3ce5" +checksum = "9ae9cec805b01e8fc3fd2fe289f89149a9b66dd16786abd8b19cfa7b48cb0098" dependencies = [ "bytes", "futures-core", @@ -3968,9 +3903,9 @@ dependencies = [ [[package]] name = "tower" -version = "0.5.2" +version = "0.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d039ad9159c98b70ecfd540b2573b97f7f52c3e8d9f8ad57a24b916a536975f9" +checksum = 
"ebe5ef63511595f1344e2d5cfa636d973292adc0eec1f0ad45fae9f0851ab1d4" dependencies = [ "futures-core", "futures-util", @@ -3983,9 +3918,9 @@ dependencies = [ [[package]] name = "tower-http" -version = "0.6.6" +version = "0.6.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "adc82fd73de2a9722ac5da747f12383d2bfdb93591ee6c58486e0097890f05f2" +checksum = "d4e6559d53cc268e5031cd8429d05415bc4cb4aefc4aa5d6cc35fbf5b924a1f8" dependencies = [ "bitflags", "bytes", @@ -4013,9 +3948,9 @@ checksum = "8df9b6e13f2d32c91b9bd719c00d1958837bc7dec474d94952798cc8e69eeec3" [[package]] name = "tracing" -version = "0.1.41" +version = "0.1.44" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "784e0ac535deb450455cbfa28a6f0df145ea1bb7ae51b821cf5e7927fdcfbdd0" +checksum = "63e71662fa4b2a2c3a26f570f037eb95bb1f85397f3cd8076caed2f026a6d100" dependencies = [ "pin-project-lite", "tracing-attributes", @@ -4024,20 +3959,20 @@ dependencies = [ [[package]] name = "tracing-attributes" -version = "0.1.30" +version = "0.1.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "81383ab64e72a7a8b8e13130c49e3dab29def6d0c7d76a03087b3cf71c5c6903" +checksum = "7490cfa5ec963746568740651ac6781f701c9c5ea257c58e057f3ba8cf69e8da" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.117", ] [[package]] name = "tracing-core" -version = "0.1.34" +version = "0.1.36" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b9d12581f227e93f094d3af2ae690a574abb8a2b9b7a96e7cfe9647b2b617678" +checksum = "db97caf9d906fbde555dd62fa95ddba9eecfd14cb388e4f491a66d74cd5fb79a" dependencies = [ "once_cell", ] @@ -4077,9 +4012,9 @@ checksum = "6af6ae20167a9ece4bcb41af5b80f8a1f1df981f6391189ce00fd257af04126a" [[package]] name = "typenum" -version = "1.18.0" +version = "1.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1dccffe3ce07af9386bfd29e80c0ab1a8205a2fc34e4bcd40364df902cfa8f3f" 
+checksum = "562d481066bde0658276a35467c4af00bdc6ee726305698a55b86e61d7ad82bb" [[package]] name = "typewit" @@ -4089,9 +4024,9 @@ checksum = "f8c1ae7cc0fdb8b842d65d127cb981574b0d2b249b74d1c7a2986863dc134f71" [[package]] name = "typify" -version = "0.4.3" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7144144e97e987c94758a3017c920a027feac0799df325d6df4fc8f08d02068e" +checksum = "e6d5bcc6f62eb1fa8aa4098f39b29f93dcb914e17158b76c50360911257aa629" dependencies = [ "typify-impl", "typify-macro", @@ -4099,9 +4034,9 @@ dependencies = [ [[package]] name = "typify-impl" -version = "0.4.3" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "062879d46aa4c9dfe0d33b035bbaf512da192131645d05deacb7033ec8581a09" +checksum = "a1eb359f7ffa4f9ebe947fa11a1b2da054564502968db5f317b7e37693cb2240" dependencies = [ "heck", "log", @@ -4112,16 +4047,16 @@ dependencies = [ "semver", "serde", "serde_json", - "syn 2.0.106", + "syn 2.0.117", "thiserror", "unicode-ident", ] [[package]] name = "typify-macro" -version = "0.4.3" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9708a3ceb6660ba3f8d2b8f0567e7d4b8b198e2b94d093b8a6077a751425de9e" +checksum = "911c32f3c8514b048c1b228361bebb5e6d73aeec01696e8cc0e82e2ffef8ab7a" dependencies = [ "proc-macro2", "quote", @@ -4130,15 +4065,15 @@ dependencies = [ "serde", "serde_json", "serde_tokenstream", - "syn 2.0.106", + "syn 2.0.117", "typify-impl", ] [[package]] name = "unicode-ident" -version = "1.0.19" +version = "1.0.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f63a545481291138910575129486daeaf8ac54aee4387fe7906919f7830c7d9d" +checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75" [[package]] name = "unicode-segmentation" @@ -4148,15 +4083,15 @@ checksum = "f6ccf251212114b54433ec949fd6a7841275f9ada20dddd2f29e9ceea4501493" [[package]] name = "unicode-width" 
-version = "0.2.1" +version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4a1a07cc7db3810833284e8d372ccdc6da29741639ecc70c9ec107df0fa6154c" +checksum = "b4ac048d71ede7ee76d585517add45da530660ef4390e49b098733c6e897f254" [[package]] -name = "unindent" -version = "0.2.4" +name = "unicode-xid" +version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7264e107f553ccae879d21fbea1d6724ac785e8c3bfc762137959b5802826ef3" +checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853" [[package]] name = "unsafe-libyaml" @@ -4172,9 +4107,9 @@ checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" [[package]] name = "url" -version = "2.5.7" +version = "2.5.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08bc136a29a3d1758e07a9cca267be308aeebf5cfd5a10f3f67ab2097683ef5b" +checksum = "ff67a8a4397373c3ef660812acab3268222035010ab8680ec4215f38ba3d0eed" dependencies = [ "form_urlencoded", "idna", @@ -4190,13 +4125,13 @@ checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be" [[package]] name = "uuid" -version = "1.18.1" +version = "1.21.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2f87b8aa10b915a06587d0dec516c282ff295b475d94abf425d62b57710070a2" +checksum = "b672338555252d43fd2240c714dc444b8c6fb0a5c5335e65a07bba7742735ddb" dependencies = [ - "getrandom 0.3.3", + "getrandom 0.4.1", "js-sys", - "serde", + "serde_core", "wasm-bindgen", ] @@ -4232,28 +4167,28 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b" [[package]] -name = "wasi" -version = "0.14.7+wasi-0.2.4" +name = "wasip2" +version = "1.0.2+wasi-0.2.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "883478de20367e224c0090af9cf5f9fa85bed63a95c1abf3afc5c083ebc06e8c" +checksum = 
"9517f9239f02c069db75e65f174b3da828fe5f5b945c4dd26bd25d89c03ebcf5" dependencies = [ - "wasip2", + "wit-bindgen", ] [[package]] -name = "wasip2" -version = "1.0.1+wasi-0.2.4" +name = "wasip3" +version = "0.4.0+wasi-0.3.0-rc-2026-01-06" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0562428422c63773dad2c345a1882263bbf4d65cf3f42e90921f787ef5ad58e7" +checksum = "5428f8bf88ea5ddc08faddef2ac4a67e390b88186c703ce6dbd955e1c145aca5" dependencies = [ "wit-bindgen", ] [[package]] name = "wasm-bindgen" -version = "0.2.101" +version = "0.2.108" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7e14915cadd45b529bb8d1f343c4ed0ac1de926144b746e2710f9cd05df6603b" +checksum = "64024a30ec1e37399cf85a7ffefebdb72205ca1c972291c51512360d90bd8566" dependencies = [ "cfg-if", "once_cell", @@ -4262,27 +4197,14 @@ dependencies = [ "wasm-bindgen-shared", ] -[[package]] -name = "wasm-bindgen-backend" -version = "0.2.101" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e28d1ba982ca7923fd01448d5c30c6864d0a14109560296a162f80f305fb93bb" -dependencies = [ - "bumpalo", - "log", - "proc-macro2", - "quote", - "syn 2.0.106", - "wasm-bindgen-shared", -] - [[package]] name = "wasm-bindgen-futures" -version = "0.4.51" +version = "0.4.58" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ca85039a9b469b38336411d6d6ced91f3fc87109a2a27b0c197663f5144dffe" +checksum = "70a6e77fd0ae8029c9ea0063f87c46fde723e7d887703d74ad2616d792e51e6f" dependencies = [ "cfg-if", + "futures-util", "js-sys", "once_cell", "wasm-bindgen", @@ -4291,9 +4213,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.101" +version = "0.2.108" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7c3d463ae3eff775b0c45df9da45d68837702ac35af998361e2c84e7c5ec1b0d" +checksum = "008b239d9c740232e71bd39e8ef6429d27097518b6b30bdf9086833bd5b6d608" dependencies = [ "quote", 
"wasm-bindgen-macro-support", @@ -4301,26 +4223,48 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.101" +version = "0.2.108" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7bb4ce89b08211f923caf51d527662b75bdc9c9c7aab40f86dcb9fb85ac552aa" +checksum = "5256bae2d58f54820e6490f9839c49780dff84c65aeab9e772f15d5f0e913a55" dependencies = [ + "bumpalo", "proc-macro2", "quote", - "syn 2.0.106", - "wasm-bindgen-backend", + "syn 2.0.117", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-shared" -version = "0.2.101" +version = "0.2.108" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f143854a3b13752c6950862c906306adb27c7e839f7414cec8fea35beab624c1" +checksum = "1f01b580c9ac74c8d8f0c0e4afb04eeef2acf145458e52c03845ee9cd23e3d12" dependencies = [ "unicode-ident", ] +[[package]] +name = "wasm-encoder" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "990065f2fe63003fe337b932cfb5e3b80e0b4d0f5ff650e6985b1048f62c8319" +dependencies = [ + "leb128fmt", + "wasmparser", +] + +[[package]] +name = "wasm-metadata" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bb0e353e6a2fbdc176932bbaab493762eb1255a7900fe0fea1a2f96c296cc909" +dependencies = [ + "anyhow", + "indexmap", + "wasm-encoder", + "wasmparser", +] + [[package]] name = "wasm-streams" version = "0.4.2" @@ -4334,11 +4278,23 @@ dependencies = [ "web-sys", ] +[[package]] +name = "wasmparser" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "47b807c72e1bac69382b3a6fb3dbe8ea4c0ed87ff5629b8685ae6b9a611028fe" +dependencies = [ + "bitflags", + "hashbrown 0.15.5", + "indexmap", + "semver", +] + [[package]] name = "web-sys" -version = "0.3.78" +version = "0.3.85" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"77e4b637749ff0d92b8fad63aa1f7cff3cbe125fd49c175cd6345e7272638b12" +checksum = "312e32e551d92129218ea9a2452120f4aabc03529ef03e4d0d82fb2780608598" dependencies = [ "js-sys", "wasm-bindgen", @@ -4376,7 +4332,7 @@ version = "0.1.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22" dependencies = [ - "windows-sys 0.61.0", + "windows-sys 0.61.2", ] [[package]] @@ -4387,67 +4343,61 @@ checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" [[package]] name = "windows-core" -version = "0.62.0" +version = "0.62.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "57fe7168f7de578d2d8a05b07fd61870d2e73b4020e9f49aa00da8471723497c" +checksum = "b8e83a14d34d0623b51dce9581199302a221863196a1dde71a7663a4c2be9deb" dependencies = [ "windows-implement", "windows-interface", - "windows-link 0.2.0", + "windows-link", "windows-result", "windows-strings", ] [[package]] name = "windows-implement" -version = "0.60.0" +version = "0.60.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a47fddd13af08290e67f4acabf4b459f647552718f683a7b415d290ac744a836" +checksum = "053e2e040ab57b9dc951b72c264860db7eb3b0200ba345b4e4c3b14f67855ddf" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.117", ] [[package]] name = "windows-interface" -version = "0.59.1" +version = "0.59.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bd9211b69f8dcdfa817bfd14bf1c97c9188afa36f4750130fcdf3f400eca9fa8" +checksum = "3f316c4a2570ba26bbec722032c4099d8c8bc095efccdc15688708623367e358" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.117", ] [[package]] name = "windows-link" -version = "0.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5e6ad25900d524eaabdbbb96d20b4311e1e7ae1699af4fb28c17ae66c80d798a" - -[[package]] -name = "windows-link" -version = 
"0.2.0" +version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "45e46c0661abb7180e7b9c281db115305d49ca1709ab8242adf09666d2173c65" +checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" [[package]] name = "windows-result" -version = "0.4.0" +version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7084dcc306f89883455a206237404d3eaf961e5bd7e0f312f7c91f57eb44167f" +checksum = "7781fa89eaf60850ac3d2da7af8e5242a5ea78d1a11c49bf2910bb5a73853eb5" dependencies = [ - "windows-link 0.2.0", + "windows-link", ] [[package]] name = "windows-strings" -version = "0.5.0" +version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7218c655a553b0bed4426cf54b20d7ba363ef543b52d515b3e48d7fd55318dda" +checksum = "7837d08f69c77cf6b07689544538e017c1bfcf57e34b4c0ff58e6c2cd3b37091" dependencies = [ - "windows-link 0.2.0", + "windows-link", ] [[package]] @@ -4474,16 +4424,16 @@ version = "0.60.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f2f500e4d28234f72040990ec9d39e3a6b950f9f22d3dba18416c35882612bcb" dependencies = [ - "windows-targets 0.53.3", + "windows-targets 0.53.5", ] [[package]] name = "windows-sys" -version = "0.61.0" +version = "0.61.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e201184e40b2ede64bc2ea34968b28e33622acdbbf37104f0e4a33f7abe657aa" +checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc" dependencies = [ - "windows-link 0.2.0", + "windows-link", ] [[package]] @@ -4504,19 +4454,19 @@ dependencies = [ [[package]] name = "windows-targets" -version = "0.53.3" +version = "0.53.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d5fe6031c4041849d7c496a8ded650796e7b6ecc19df1a431c1a363342e5dc91" +checksum = "4945f9f551b88e0d65f3db0bc25c33b8acea4d9e41163edf90dcd0b19f9069f3" dependencies = [ - "windows-link 0.1.3", - 
"windows_aarch64_gnullvm 0.53.0", - "windows_aarch64_msvc 0.53.0", - "windows_i686_gnu 0.53.0", - "windows_i686_gnullvm 0.53.0", - "windows_i686_msvc 0.53.0", - "windows_x86_64_gnu 0.53.0", - "windows_x86_64_gnullvm 0.53.0", - "windows_x86_64_msvc 0.53.0", + "windows-link", + "windows_aarch64_gnullvm 0.53.1", + "windows_aarch64_msvc 0.53.1", + "windows_i686_gnu 0.53.1", + "windows_i686_gnullvm 0.53.1", + "windows_i686_msvc 0.53.1", + "windows_x86_64_gnu 0.53.1", + "windows_x86_64_gnullvm 0.53.1", + "windows_x86_64_msvc 0.53.1", ] [[package]] @@ -4527,9 +4477,9 @@ checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" [[package]] name = "windows_aarch64_gnullvm" -version = "0.53.0" +version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "86b8d5f90ddd19cb4a147a5fa63ca848db3df085e25fee3cc10b39b6eebae764" +checksum = "a9d8416fa8b42f5c947f8482c43e7d89e73a173cead56d044f6a56104a6d1b53" [[package]] name = "windows_aarch64_msvc" @@ -4539,9 +4489,9 @@ checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" [[package]] name = "windows_aarch64_msvc" -version = "0.53.0" +version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c7651a1f62a11b8cbd5e0d42526e55f2c99886c77e007179efff86c2b137e66c" +checksum = "b9d782e804c2f632e395708e99a94275910eb9100b2114651e04744e9b125006" [[package]] name = "windows_i686_gnu" @@ -4551,9 +4501,9 @@ checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" [[package]] name = "windows_i686_gnu" -version = "0.53.0" +version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c1dc67659d35f387f5f6c479dc4e28f1d4bb90ddd1a5d3da2e5d97b42d6272c3" +checksum = "960e6da069d81e09becb0ca57a65220ddff016ff2d6af6a223cf372a506593a3" [[package]] name = "windows_i686_gnullvm" @@ -4563,9 +4513,9 @@ checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" [[package]] 
name = "windows_i686_gnullvm" -version = "0.53.0" +version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ce6ccbdedbf6d6354471319e781c0dfef054c81fbc7cf83f338a4296c0cae11" +checksum = "fa7359d10048f68ab8b09fa71c3daccfb0e9b559aed648a8f95469c27057180c" [[package]] name = "windows_i686_msvc" @@ -4575,9 +4525,9 @@ checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" [[package]] name = "windows_i686_msvc" -version = "0.53.0" +version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "581fee95406bb13382d2f65cd4a908ca7b1e4c2f1917f143ba16efe98a589b5d" +checksum = "1e7ac75179f18232fe9c285163565a57ef8d3c89254a30685b57d83a38d326c2" [[package]] name = "windows_x86_64_gnu" @@ -4587,9 +4537,9 @@ checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" [[package]] name = "windows_x86_64_gnu" -version = "0.53.0" +version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2e55b5ac9ea33f2fc1716d1742db15574fd6fc8dadc51caab1c16a3d3b4190ba" +checksum = "9c3842cdd74a865a8066ab39c8a7a473c0778a3f29370b5fd6b4b9aa7df4a499" [[package]] name = "windows_x86_64_gnullvm" @@ -4599,9 +4549,9 @@ checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" [[package]] name = "windows_x86_64_gnullvm" -version = "0.53.0" +version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0a6e035dd0599267ce1ee132e51c27dd29437f63325753051e71dd9e42406c57" +checksum = "0ffa179e2d07eee8ad8f57493436566c7cc30ac536a3379fdf008f47f6bb7ae1" [[package]] name = "windows_x86_64_msvc" @@ -4611,38 +4561,110 @@ checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" [[package]] name = "windows_x86_64_msvc" -version = "0.53.0" +version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "271414315aff87387382ec3d271b52d7ae78726f5d44ac98b4f4030c91880486" 
+checksum = "d6bbff5f0aada427a1e5a6da5f1f98158182f26556f345ac9e04d36d0ebed650" [[package]] name = "wit-bindgen" -version = "0.46.0" +version = "0.51.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f17a85883d4e6d00e8a97c586de764dabcc06133f7f1d55dce5cdc070ad7fe59" +checksum = "d7249219f66ced02969388cf2bb044a09756a083d0fab1e566056b04d9fbcaa5" +dependencies = [ + "wit-bindgen-rust-macro", +] [[package]] -name = "writeable" -version = "0.6.1" +name = "wit-bindgen-core" +version = "0.51.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ea2f10b9bb0928dfb1b42b65e1f9e36f7f54dbdf08457afefb38afcdec4fa2bb" +checksum = "ea61de684c3ea68cb082b7a88508a8b27fcc8b797d738bfc99a82facf1d752dc" +dependencies = [ + "anyhow", + "heck", + "wit-parser", +] [[package]] -name = "xz2" -version = "0.1.7" +name = "wit-bindgen-rust" +version = "0.51.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "388c44dc09d76f1536602ead6d325eb532f5c122f17782bd57fb47baeeb767e2" +checksum = "b7c566e0f4b284dd6561c786d9cb0142da491f46a9fbed79ea69cdad5db17f21" dependencies = [ - "lzma-sys", + "anyhow", + "heck", + "indexmap", + "prettyplease", + "syn 2.0.117", + "wasm-metadata", + "wit-bindgen-core", + "wit-component", ] [[package]] -name = "yoke" -version = "0.8.0" +name = "wit-bindgen-rust-macro" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c0f9bfd77e6a48eccf51359e3ae77140a7f50b1e2ebfe62422d8afdaffab17a" +dependencies = [ + "anyhow", + "prettyplease", + "proc-macro2", + "quote", + "syn 2.0.117", + "wit-bindgen-core", + "wit-bindgen-rust", +] + +[[package]] +name = "wit-component" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d66ea20e9553b30172b5e831994e35fbde2d165325bec84fc43dbf6f4eb9cb2" +dependencies = [ + "anyhow", + "bitflags", + "indexmap", + "log", + "serde", + "serde_derive", + "serde_json", + 
"wasm-encoder", + "wasm-metadata", + "wasmparser", + "wit-parser", +] + +[[package]] +name = "wit-parser" +version = "0.244.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f41bb01b8226ef4bfd589436a297c53d118f65921786300e427be8d487695cc" +checksum = "ecc8ac4bc1dc3381b7f59c34f00b67e18f910c2c0f50015669dde7def656a736" dependencies = [ + "anyhow", + "id-arena", + "indexmap", + "log", + "semver", "serde", + "serde_derive", + "serde_json", + "unicode-xid", + "wasmparser", +] + +[[package]] +name = "writeable" +version = "0.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9edde0db4769d2dc68579893f2306b26c6ecfbe0ef499b013d731b7b9247e0b9" + +[[package]] +name = "yoke" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72d6e5c6afb84d73944e5cedb052c4680d5657337201555f9f2a16b7406d4954" +dependencies = [ "stable_deref_trait", "yoke-derive", "zerofrom", @@ -4650,34 +4672,34 @@ dependencies = [ [[package]] name = "yoke-derive" -version = "0.8.0" +version = "0.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38da3c9736e16c5d3c8c597a9aaa5d1fa565d0532ae05e27c24aa62fb32c0ab6" +checksum = "b659052874eb698efe5b9e8cf382204678a0086ebf46982b79d6ca3182927e5d" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.117", "synstructure", ] [[package]] name = "zerocopy" -version = "0.8.27" +version = "0.8.39" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0894878a5fa3edfd6da3f88c4805f4c8558e2b996227a3d864f47fe11e38282c" +checksum = "db6d35d663eadb6c932438e763b262fe1a70987f9ae936e60158176d710cae4a" dependencies = [ "zerocopy-derive", ] [[package]] name = "zerocopy-derive" -version = "0.8.27" +version = "0.8.39" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "88d2b8d9c68ad2b9e4340d7832716a4d21a22a1154777ad56ea55c51a9cf3831" +checksum = 
"4122cd3169e94605190e77839c9a40d40ed048d305bfdc146e7df40ab0f3e517" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.117", ] [[package]] @@ -4697,21 +4719,21 @@ checksum = "d71e5d6e06ab090c67b5e44993ec16b72dcbaabc526db883a360057678b48502" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.117", "synstructure", ] [[package]] name = "zeroize" -version = "1.8.1" +version = "1.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ced3678a2879b30306d323f4542626697a464a97c0a07c9aebf7ebca65cd4dde" +checksum = "b97154e67e32c85465826e8bcc1c59429aaaf107c1e4a9e53c8d8ccd5eff88d0" [[package]] name = "zerotrie" -version = "0.2.2" +version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "36f0bbd478583f79edad978b407914f61b2972f5af6fa089686016be8f9af595" +checksum = "2a59c17a5562d507e4b54960e8569ebee33bee890c70aa3fe7b97e85a9fd7851" dependencies = [ "displaydoc", "yoke", @@ -4720,9 +4742,9 @@ dependencies = [ [[package]] name = "zerovec" -version = "0.11.4" +version = "0.11.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e7aa2bd55086f1ab526693ecbe444205da57e25f4489879da80635a46d90e73b" +checksum = "6c28719294829477f525be0186d13efa9a3c602f7ec202ca9e353d310fb9a002" dependencies = [ "yoke", "zerofrom", @@ -4731,20 +4753,26 @@ dependencies = [ [[package]] name = "zerovec-derive" -version = "0.11.1" +version = "0.11.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b96237efa0c878c64bd89c436f661be4e46b2f3eff1ebb976f7ef2321d2f58f" +checksum = "eadce39539ca5cb3985590102671f2567e659fca9666581ad3411d59207951f3" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn 2.0.117", ] [[package]] name = "zlib-rs" -version = "0.5.2" +version = "0.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c745c48e1007337ed136dc99df34128b9faa6ed542d80a1c673cf55a6d7236c8" + +[[package]] +name = "zmij" 
+version = "1.0.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2f06ae92f42f5e5c42443fd094f245eb656abf56dd7cce9b8b263236565e00f2" +checksum = "b8848ee67ecc8aedbaf3e4122217aff892639231befc6a1b58d29fff4c2cabaa" [[package]] name = "zstd" diff --git a/Cargo.toml b/Cargo.toml index 18640b264..19b79daf8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -15,52 +15,67 @@ # specific language governing permissions and limitations # under the License. -[package] -name = "datafusion-python" -version = "50.0.0" +[workspace.package] +version = "52.0.0" homepage = "https://datafusion.apache.org/python" repository = "https://github.com/apache/datafusion-python" authors = ["Apache DataFusion "] description = "Apache DataFusion DataFrame and SQL Query Engine" readme = "README.md" license = "Apache-2.0" -edition = "2021" -rust-version = "1.78" -include = ["/src", "/datafusion", "/LICENSE.txt", "build.rs", "pyproject.toml", "Cargo.toml", "Cargo.lock"] +edition = "2024" +rust-version = "1.88" -[features] -default = ["mimalloc"] -protoc = [ "datafusion-substrait/protoc" ] -substrait = ["dep:datafusion-substrait"] +[workspace] +members = ["crates/core", "crates/util", "examples/datafusion-ffi-example"] +resolver = "3" -[dependencies] -tokio = { version = "1.47", features = ["macros", "rt", "rt-multi-thread", "sync"] } -pyo3 = { version = "0.25", features = ["extension-module", "abi3", "abi3-py39"] } -pyo3-async-runtimes = { version = "0.25", features = ["tokio-runtime"]} -pyo3-log = "0.12.4" -arrow = { version = "56", features = ["pyarrow"] } -datafusion = { version = "50", features = ["avro", "unicode_expressions"] } -datafusion-substrait = { version = "50", optional = true } -datafusion-proto = { version = "50" } -datafusion-ffi = { version = "50" } -prost = "0.13.1" # keep in line with `datafusion-substrait` -uuid = { version = "1.18", features = ["v4"] } -mimalloc = { version = "0.1", optional = true, default-features = false, features = 
["local_dynamic_tls"] } +[workspace.dependencies] +tokio = { version = "1.49" } +pyo3 = { version = "0.28" } +pyo3-async-runtimes = { version = "0.28" } +pyo3-log = "0.13.3" +arrow = { version = "58" } +arrow-array = { version = "58" } +arrow-schema = { version = "58" } +arrow-select = { version = "58" } +datafusion = { version = "53" } +datafusion-substrait = { version = "53" } +datafusion-proto = { version = "53" } +datafusion-ffi = { version = "53" } +datafusion-catalog = { version = "53", default-features = false } +datafusion-common = { version = "53", default-features = false } +datafusion-functions-aggregate = { version = "53" } +datafusion-functions-window = { version = "53" } +datafusion-expr = { version = "53" } +prost = "0.14.3" +serde_json = "1" +uuid = { version = "1.21" } +mimalloc = { version = "0.1", default-features = false } async-trait = "0.1.89" futures = "0.3" -object_store = { version = "0.12.4", features = ["aws", "gcp", "azure", "http"] } +cstr = "0.2" +object_store = { version = "0.13.1" } url = "2" -log = "0.4.27" +log = "0.4.29" parking_lot = "0.12" - -[build-dependencies] -prost-types = "0.13.1" # keep in line with `datafusion-substrait` -pyo3-build-config = "0.25" - -[lib] -name = "datafusion_python" -crate-type = ["cdylib", "rlib"] +prost-types = "0.14.3" # keep in line with `datafusion-substrait` +pyo3-build-config = "0.28" +datafusion-python-util = { path = "crates/util" } [profile.release] lto = true codegen-units = 1 + +# We cannot publish to crates.io with any patches in the below section. Developers +# must remove any entries in this section before creating a release candidate. 
+[patch.crates-io] +datafusion = { git = "https://github.com/apache/datafusion.git", rev = "35749607f585b3bf25b66b7d2289c56c18d03e4f", submodules = false } +datafusion-substrait = { git = "https://github.com/apache/datafusion.git", rev = "35749607f585b3bf25b66b7d2289c56c18d03e4f", submodules = false } +datafusion-proto = { git = "https://github.com/apache/datafusion.git", rev = "35749607f585b3bf25b66b7d2289c56c18d03e4f", submodules = false } +datafusion-ffi = { git = "https://github.com/apache/datafusion.git", rev = "35749607f585b3bf25b66b7d2289c56c18d03e4f", submodules = false } +datafusion-catalog = { git = "https://github.com/apache/datafusion.git", rev = "35749607f585b3bf25b66b7d2289c56c18d03e4f", submodules = false } +datafusion-common = { git = "https://github.com/apache/datafusion.git", rev = "35749607f585b3bf25b66b7d2289c56c18d03e4f", submodules = false } +datafusion-functions-aggregate = { git = "https://github.com/apache/datafusion.git", rev = "35749607f585b3bf25b66b7d2289c56c18d03e4f", submodules = false } +datafusion-functions-window = { git = "https://github.com/apache/datafusion.git", rev = "35749607f585b3bf25b66b7d2289c56c18d03e4f", submodules = false } +datafusion-expr = { git = "https://github.com/apache/datafusion.git", rev = "35749607f585b3bf25b66b7d2289c56c18d03e4f", submodules = false } diff --git a/README.md b/README.md index 0cdf17ab8..c24257876 100644 --- a/README.md +++ b/README.md @@ -275,7 +275,16 @@ needing to activate the virtual environment: ```bash uv run --no-project maturin develop --uv -uv --no-project pytest . +uv run --no-project pytest . 
+``` + +To run the FFI tests within the examples folder, after you have built +`datafusion-python` with the previous commands: + +```bash +cd examples/datafusion-ffi-example +uv run --no-project maturin develop --uv +uv run --no-project pytest python/tests/_test_*py ``` ### Running & Installing pre-commit hooks diff --git a/benchmarks/db-benchmark/groupby-datafusion.py b/benchmarks/db-benchmark/groupby-datafusion.py index f9e8d638b..533166695 100644 --- a/benchmarks/db-benchmark/groupby-datafusion.py +++ b/benchmarks/db-benchmark/groupby-datafusion.py @@ -18,6 +18,7 @@ import gc import os import timeit +from pathlib import Path import datafusion as df import pyarrow as pa @@ -34,7 +35,7 @@ print("# groupby-datafusion.py", flush=True) -exec(open("./_helpers/helpers.py").read()) +exec(Path("./_helpers/helpers.py").read_text()) def ans_shape(batches) -> tuple[int, int]: @@ -65,7 +66,7 @@ def execute(df) -> list: sql = True data_name = os.environ["SRC_DATANAME"] -src_grp = os.path.join("data", data_name + ".csv") +src_grp = Path("data") / f"{data_name}.csv" print("loading dataset %s" % src_grp, flush=True) schema = pa.schema( diff --git a/benchmarks/db-benchmark/join-datafusion.py b/benchmarks/db-benchmark/join-datafusion.py index 039868031..3be296c81 100755 --- a/benchmarks/db-benchmark/join-datafusion.py +++ b/benchmarks/db-benchmark/join-datafusion.py @@ -18,6 +18,7 @@ import gc import os import timeit +from pathlib import Path import datafusion as df from datafusion import col @@ -26,7 +27,7 @@ print("# join-datafusion.py", flush=True) -exec(open("./_helpers/helpers.py").read()) +exec(Path("./_helpers/helpers.py").read_text()) def ans_shape(batches) -> tuple[int, int]: @@ -49,12 +50,12 @@ def ans_shape(batches) -> tuple[int, int]: on_disk = "FALSE" data_name = os.environ["SRC_DATANAME"] -src_jn_x = os.path.join("data", data_name + ".csv") +src_jn_x = Path("data") / f"{data_name}.csv" y_data_name = join_to_tbls(data_name) src_jn_y = [ - os.path.join("data", y_data_name[0] +
".csv"), - os.path.join("data", y_data_name[1] + ".csv"), - os.path.join("data", y_data_name[2] + ".csv"), + Path("data") / f"{y_data_name[0]}.csv", + Path("data") / f"{y_data_name[1]}.csv", + Path("data") / f"{y_data_name[2]}.csv", ] if len(src_jn_y) != 3: error_msg = "Something went wrong in preparing files used for join" diff --git a/benchmarks/tpch/tpch.py b/benchmarks/tpch/tpch.py index 2d1bbae5b..ffee5554c 100644 --- a/benchmarks/tpch/tpch.py +++ b/benchmarks/tpch/tpch.py @@ -17,12 +17,13 @@ import argparse import time +from pathlib import Path from datafusion import SessionContext def bench(data_path, query_path) -> None: - with open("results.csv", "w") as results: + with Path("results.csv").open("w") as results: # register tables start = time.time() total_time_millis = 0 @@ -45,7 +46,7 @@ def bench(data_path, query_path) -> None: print("Configuration:\n", ctx) # register tables - with open("create_tables.sql") as f: + with Path("create_tables.sql").open() as f: sql = "" for line in f.readlines(): if line.startswith("--"): @@ -65,7 +66,7 @@ def bench(data_path, query_path) -> None: # run queries for query in range(1, 23): - with open(f"{query_path}/q{query}.sql") as f: + with Path(f"{query_path}/q{query}.sql").open() as f: text = f.read() tmp = text.split(";") queries = [s.strip() for s in tmp if len(s.strip()) > 0] diff --git a/ci/scripts/rust_fmt.sh b/ci/scripts/rust_fmt.sh index 9d8325877..05cb6b208 100755 --- a/ci/scripts/rust_fmt.sh +++ b/ci/scripts/rust_fmt.sh @@ -18,4 +18,4 @@ # under the License. set -ex -cargo fmt --all -- --check +cargo +nightly fmt --all -- --check diff --git a/conftest.py b/conftest.py new file mode 100644 index 000000000..1c89f92bc --- /dev/null +++ b/conftest.py @@ -0,0 +1,29 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership.
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Pytest configuration for doctest namespace injection.""" + +import datafusion as dfn +import numpy as np +import pytest + + +@pytest.fixture(autouse=True) +def _doctest_namespace(doctest_namespace: dict) -> None: + """Add common imports to the doctest namespace.""" + doctest_namespace["dfn"] = dfn + doctest_namespace["np"] = np diff --git a/crates/core/Cargo.toml b/crates/core/Cargo.toml new file mode 100644 index 000000000..3e2b01c8e --- /dev/null +++ b/crates/core/Cargo.toml @@ -0,0 +1,82 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +[package] +name = "datafusion-python" +version.workspace = true +edition.workspace = true +rust-version.workspace = true +license.workspace = true +description.workspace = true +homepage.workspace = true +repository.workspace = true +include = [ + "src", + "../LICENSE.txt", + "build.rs", + "../pyproject.toml", + "Cargo.toml", + "../Cargo.lock", +] + +[dependencies] +tokio = { workspace = true, features = [ + "macros", + "rt", + "rt-multi-thread", + "sync", +] } +pyo3 = { workspace = true, features = [ + "extension-module", + "abi3", + "abi3-py310", +] } +pyo3-async-runtimes = { workspace = true, features = ["tokio-runtime"] } +pyo3-log = { workspace = true } +arrow = { workspace = true, features = ["pyarrow"] } +arrow-select = { workspace = true } +datafusion = { workspace = true, features = ["avro", "unicode_expressions"] } +datafusion-substrait = { workspace = true, optional = true } +datafusion-proto = { workspace = true } +datafusion-ffi = { workspace = true } +prost = { workspace = true } # keep in line with `datafusion-substrait` +serde_json = { workspace = true } +uuid = { workspace = true, features = ["v4"] } +mimalloc = { workspace = true, optional = true, features = [ + "local_dynamic_tls", +] } +async-trait = { workspace = true } +futures = { workspace = true } +cstr = { workspace = true } +object_store = { workspace = true, features = ["aws", "gcp", "azure", "http"] } +url = { workspace = true } +log = { workspace = true } +parking_lot = { workspace = true } +datafusion-python-util = { workspace = true } + +[build-dependencies] +prost-types = { workspace = true } +pyo3-build-config = { workspace = true } + +[features] +default = ["mimalloc"] +protoc = ["datafusion-substrait/protoc"] +substrait = ["dep:datafusion-substrait"] + +[lib] +name = "datafusion_python" +crate-type = ["cdylib", "rlib"] diff --git a/build.rs b/crates/core/build.rs similarity index 100% rename from build.rs rename to crates/core/build.rs diff --git a/crates/core/src/array.rs 
b/crates/core/src/array.rs new file mode 100644 index 000000000..99e63ef50 --- /dev/null +++ b/crates/core/src/array.rs @@ -0,0 +1,92 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::ptr::NonNull; +use std::sync::Arc; + +use arrow::array::{Array, ArrayRef}; +use arrow::datatypes::{Field, FieldRef}; +use arrow::ffi::{FFI_ArrowArray, FFI_ArrowSchema}; +use arrow::pyarrow::ToPyArrow; +use datafusion_python_util::validate_pycapsule; +use pyo3::ffi::c_str; +use pyo3::prelude::{PyAnyMethods, PyCapsuleMethods}; +use pyo3::types::PyCapsule; +use pyo3::{Bound, PyAny, PyResult, Python, pyclass, pymethods}; + +use crate::errors::PyDataFusionResult; + +/// A Python object which implements the Arrow PyCapsule for importing +/// into other libraries. 
+#[pyclass( + from_py_object, + name = "ArrowArrayExportable", + module = "datafusion", + frozen +)] +#[derive(Clone)] +pub struct PyArrowArrayExportable { + array: ArrayRef, + field: FieldRef, +} + +#[pymethods] +impl PyArrowArrayExportable { + #[pyo3(signature = (requested_schema=None))] + fn __arrow_c_array__<'py>( + &'py self, + py: Python<'py>, + requested_schema: Option>, + ) -> PyDataFusionResult<(Bound<'py, PyCapsule>, Bound<'py, PyCapsule>)> { + let field = if let Some(schema_capsule) = requested_schema { + validate_pycapsule(&schema_capsule, "arrow_schema")?; + + let data: NonNull = schema_capsule + .pointer_checked(Some(c_str!("arrow_schema")))? + .cast(); + let schema_ptr = unsafe { data.as_ref() }; + let desired_field = Field::try_from(schema_ptr)?; + + Arc::new(desired_field) + } else { + Arc::clone(&self.field) + }; + + let ffi_schema = FFI_ArrowSchema::try_from(&field)?; + let schema_capsule = PyCapsule::new(py, ffi_schema, Some(cr"arrow_schema".into()))?; + + let ffi_array = FFI_ArrowArray::new(&self.array.to_data()); + let array_capsule = PyCapsule::new(py, ffi_array, Some(cr"arrow_array".into()))?; + + Ok((schema_capsule, array_capsule)) + } +} + +impl ToPyArrow for PyArrowArrayExportable { + fn to_pyarrow<'py>(&self, py: Python<'py>) -> PyResult> { + let module = py.import("pyarrow")?; + let method = module.getattr("array")?; + let array = method.call((self.clone(),), None)?; + Ok(array) + } +} + +impl PyArrowArrayExportable { + pub fn new(array: ArrayRef, field: FieldRef) -> Self { + Self { array, field } + } +} diff --git a/crates/core/src/catalog.rs b/crates/core/src/catalog.rs new file mode 100644 index 000000000..f707e7e5c --- /dev/null +++ b/crates/core/src/catalog.rs @@ -0,0 +1,731 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::any::Any; +use std::collections::HashSet; +use std::ptr::NonNull; +use std::sync::Arc; + +use async_trait::async_trait; +use datafusion::catalog::{ + CatalogProvider, CatalogProviderList, MemoryCatalogProvider, MemoryCatalogProviderList, + MemorySchemaProvider, SchemaProvider, +}; +use datafusion::common::DataFusionError; +use datafusion::datasource::TableProvider; +use datafusion_ffi::catalog_provider::FFI_CatalogProvider; +use datafusion_ffi::proto::logical_extension_codec::FFI_LogicalExtensionCodec; +use datafusion_ffi::schema_provider::FFI_SchemaProvider; +use datafusion_python_util::{ + create_logical_extension_capsule, ffi_logical_codec_from_pycapsule, validate_pycapsule, + wait_for_future, +}; +use pyo3::IntoPyObjectExt; +use pyo3::exceptions::PyKeyError; +use pyo3::ffi::c_str; +use pyo3::prelude::*; +use pyo3::types::PyCapsule; + +use crate::context::PySessionContext; +use crate::dataset::Dataset; +use crate::errors::{PyDataFusionError, PyDataFusionResult, py_datafusion_err, to_datafusion_err}; +use crate::table::PyTable; + +#[pyclass( + from_py_object, + frozen, + name = "RawCatalogList", + module = "datafusion.catalog", + subclass +)] +#[derive(Clone)] +pub struct PyCatalogList { + pub catalog_list: Arc, + codec: Arc, +} + +#[pyclass( + from_py_object, + frozen, + name = "RawCatalog", + module = "datafusion.catalog", + subclass +)] +#[derive(Clone)] +pub struct 
PyCatalog { + pub catalog: Arc, + codec: Arc, +} + +#[pyclass( + from_py_object, + frozen, + name = "RawSchema", + module = "datafusion.catalog", + subclass +)] +#[derive(Clone)] +pub struct PySchema { + pub schema: Arc, + codec: Arc, +} + +impl PyCatalog { + pub(crate) fn new_from_parts( + catalog: Arc, + codec: Arc, + ) -> Self { + Self { catalog, codec } + } +} + +impl PySchema { + pub(crate) fn new_from_parts( + schema: Arc, + codec: Arc, + ) -> Self { + Self { schema, codec } + } +} + +#[pymethods] +impl PyCatalogList { + #[new] + pub fn new( + py: Python, + catalog_list: Py, + session: Option>, + ) -> PyResult { + let codec = extract_logical_extension_codec(py, session)?; + let catalog_list = Arc::new(RustWrappedPyCatalogProviderList::new( + catalog_list, + codec.clone(), + )) as Arc; + Ok(Self { + catalog_list, + codec, + }) + } + + #[staticmethod] + pub fn memory_catalog_list(py: Python, session: Option>) -> PyResult { + let codec = extract_logical_extension_codec(py, session)?; + let catalog_list = + Arc::new(MemoryCatalogProviderList::default()) as Arc; + Ok(Self { + catalog_list, + codec, + }) + } + + pub fn catalog_names(&self) -> HashSet { + self.catalog_list.catalog_names().into_iter().collect() + } + + /// Returns the catalog registered under `name`; raises `KeyError` when no such catalog exists. + #[pyo3(signature = (name="public"))] + pub fn catalog(&self, name: &str) -> PyResult> { + let catalog = self + .catalog_list + .catalog(name) + .ok_or(PyKeyError::new_err(format!( + "Catalog with name {name} doesn't exist."
+ )))?; + + Python::attach(|py| { + match catalog + .as_any() + .downcast_ref::() + { + Some(wrapped_catalog) => Ok(wrapped_catalog.catalog_provider.clone_ref(py)), + None => PyCatalog::new_from_parts(catalog, self.codec.clone()).into_py_any(py), + } + }) + } + + pub fn register_catalog(&self, name: &str, catalog_provider: Bound<'_, PyAny>) -> PyResult<()> { + let provider = extract_catalog_provider_from_pyobj(catalog_provider, self.codec.as_ref())?; + + let _ = self + .catalog_list + .register_catalog(name.to_owned(), provider); + + Ok(()) + } + + pub fn __repr__(&self) -> PyResult { + let mut names: Vec = self.catalog_names().into_iter().collect(); + names.sort(); + Ok(format!("CatalogList(catalog_names=[{}])", names.join(", "))) + } +} + +#[pymethods] +impl PyCatalog { + #[new] + pub fn new(py: Python, catalog: Py, session: Option>) -> PyResult { + let codec = extract_logical_extension_codec(py, session)?; + let catalog = Arc::new(RustWrappedPyCatalogProvider::new(catalog, codec.clone())) + as Arc; + Ok(Self { catalog, codec }) + } + + #[staticmethod] + pub fn memory_catalog(py: Python, session: Option>) -> PyResult { + let codec = extract_logical_extension_codec(py, session)?; + let catalog = Arc::new(MemoryCatalogProvider::default()) as Arc; + Ok(Self { catalog, codec }) + } + + pub fn schema_names(&self) -> HashSet { + self.catalog.schema_names().into_iter().collect() + } + + #[pyo3(signature = (name="public"))] + pub fn schema(&self, name: &str) -> PyResult> { + let schema = self + .catalog + .schema(name) + .ok_or(PyKeyError::new_err(format!( + "Schema with name {name} doesn't exist." 
+ )))?; + + Python::attach(|py| { + match schema + .as_any() + .downcast_ref::() + { + Some(wrapped_schema) => Ok(wrapped_schema.schema_provider.clone_ref(py)), + None => PySchema::new_from_parts(schema, self.codec.clone()).into_py_any(py), + } + }) + } + + pub fn register_schema(&self, name: &str, schema_provider: Bound<'_, PyAny>) -> PyResult<()> { + let provider = extract_schema_provider_from_pyobj(schema_provider, self.codec.as_ref())?; + + let _ = self + .catalog + .register_schema(name, provider) + .map_err(py_datafusion_err)?; + + Ok(()) + } + + pub fn deregister_schema(&self, name: &str, cascade: bool) -> PyResult<()> { + let _ = self + .catalog + .deregister_schema(name, cascade) + .map_err(py_datafusion_err)?; + + Ok(()) + } + + pub fn __repr__(&self) -> PyResult { + let mut names: Vec = self.schema_names().into_iter().collect(); + names.sort(); + Ok(format!("Catalog(schema_names=[{}])", names.join(", "))) + } +} + +#[pymethods] +impl PySchema { + #[new] + pub fn new( + py: Python, + schema_provider: Py, + session: Option>, + ) -> PyResult { + let codec = extract_logical_extension_codec(py, session)?; + let schema = + Arc::new(RustWrappedPySchemaProvider::new(schema_provider)) as Arc; + Ok(Self { schema, codec }) + } + + #[staticmethod] + fn memory_schema(py: Python, session: Option>) -> PyResult { + let codec = extract_logical_extension_codec(py, session)?; + let schema = Arc::new(MemorySchemaProvider::default()) as Arc; + Ok(Self { schema, codec }) + } + + #[getter] + fn table_names(&self) -> HashSet { + self.schema.table_names().into_iter().collect() + } + + fn table(&self, name: &str, py: Python) -> PyDataFusionResult { + if let Some(table) = wait_for_future(py, self.schema.table(name))?? 
{ + Ok(PyTable::from(table)) + } else { + Err(PyDataFusionError::Common(format!( + "Table not found: {name}" + ))) + } + } + + /// Renders the schema as `Schema(table_names=[...])` with the table names sorted and comma-separated. + fn __repr__(&self) -> PyResult { + let mut names: Vec = self.table_names().into_iter().collect(); + names.sort(); + Ok(format!("Schema(table_names=[{}])", names.join(", "))) + } + + fn register_table(&self, name: &str, table_provider: Bound<'_, PyAny>) -> PyResult<()> { + let py = table_provider.py(); + let codec_capsule = create_logical_extension_capsule(py, self.codec.as_ref())? + .as_any() + .clone(); + + let table = PyTable::new(table_provider, Some(codec_capsule))?; + + let _ = self + .schema + .register_table(name.to_string(), table.table) + .map_err(py_datafusion_err)?; + + Ok(()) + } + + fn deregister_table(&self, name: &str) -> PyResult<()> { + let _ = self + .schema + .deregister_table(name) + .map_err(py_datafusion_err)?; + + Ok(()) + } + + fn table_exist(&self, name: &str) -> bool { + self.schema.table_exist(name) + } +} + +#[derive(Debug)] +pub(crate) struct RustWrappedPySchemaProvider { + schema_provider: Py, + owner_name: Option, +} + +impl RustWrappedPySchemaProvider { + pub fn new(schema_provider: Py) -> Self { + let owner_name = Python::attach(|py| { + schema_provider + .bind(py) + .getattr("owner_name") + .ok() + .map(|name| name.to_string()) + }); + + Self { + schema_provider, + owner_name, + } + } + + fn table_inner(&self, name: &str) -> PyResult>> { + Python::attach(|py| { + let provider = self.schema_provider.bind(py); + let py_table_method = provider.getattr("table")?; + + let py_table = py_table_method.call((name,), None)?; + if py_table.is_none() { + return Ok(None); + } + + let table = PyTable::new(py_table, None)?; + + Ok(Some(table.table)) + }) + } +} + +#[async_trait] +impl SchemaProvider for RustWrappedPySchemaProvider { + fn owner_name(&self) -> Option<&str> { + self.owner_name.as_deref() + } + + fn as_any(&self) -> &dyn Any { + self + } + + fn table_names(&self) -> Vec { + Python::attach(|py| { + let provider
= self.schema_provider.bind(py); + + provider + .getattr("table_names") + .and_then(|names| names.extract::>()) + .unwrap_or_else(|err| { + log::error!("Unable to get table_names: {err}"); + Vec::default() + }) + }) + } + + async fn table( + &self, + name: &str, + ) -> datafusion::common::Result>, DataFusionError> { + self.table_inner(name) + .map_err(|e| DataFusionError::External(Box::new(e))) + } + + fn register_table( + &self, + name: String, + table: Arc, + ) -> datafusion::common::Result>> { + let py_table = PyTable::from(table); + Python::attach(|py| { + let provider = self.schema_provider.bind(py); + let _ = provider + .call_method1("register_table", (name, py_table)) + .map_err(to_datafusion_err)?; + // Since the definition of `register_table` says that an error + // will be returned if the table already exists, there is no + // case where we want to return a table provider as output. + Ok(None) + }) + } + + fn deregister_table( + &self, + name: &str, + ) -> datafusion::common::Result>> { + Python::attach(|py| { + let provider = self.schema_provider.bind(py); + let table = provider + .call_method1("deregister_table", (name,)) + .map_err(to_datafusion_err)?; + if table.is_none() { + return Ok(None); + } + + // If we can turn this table provider into a `Dataset`, return it. + // Otherwise, return None. 
+ let dataset = match Dataset::new(&table, py) { + Ok(dataset) => Some(Arc::new(dataset) as Arc), + Err(_) => None, + }; + + Ok(dataset) + }) + } + + fn table_exist(&self, name: &str) -> bool { + Python::attach(|py| { + let provider = self.schema_provider.bind(py); + provider + .call_method1("table_exist", (name,)) + .and_then(|pyobj| pyobj.extract()) + .unwrap_or(false) + }) + } +} + +#[derive(Debug)] +pub(crate) struct RustWrappedPyCatalogProvider { + pub(crate) catalog_provider: Py, + codec: Arc, +} + +impl RustWrappedPyCatalogProvider { + pub fn new(catalog_provider: Py, codec: Arc) -> Self { + Self { + catalog_provider, + codec, + } + } + + fn schema_inner(&self, name: &str) -> PyResult>> { + Python::attach(|py| { + let provider = self.catalog_provider.bind(py); + + let py_schema = provider.call_method1("schema", (name,))?; + if py_schema.is_none() { + return Ok(None); + } + + extract_schema_provider_from_pyobj(py_schema, self.codec.as_ref()).map(Some) + }) + } +} + +#[async_trait] +impl CatalogProvider for RustWrappedPyCatalogProvider { + fn as_any(&self) -> &dyn Any { + self + } + + fn schema_names(&self) -> Vec { + Python::attach(|py| { + let provider = self.catalog_provider.bind(py); + provider + .call_method0("schema_names") + .and_then(|names| names.extract::>()) + .map(|names| names.into_iter().collect()) + .unwrap_or_else(|err| { + log::error!("Unable to get schema_names: {err}"); + Vec::default() + }) + }) + } + + fn schema(&self, name: &str) -> Option> { + self.schema_inner(name).unwrap_or_else(|err| { + log::error!("CatalogProvider schema returned error: {err}"); + None + }) + } + + fn register_schema( + &self, + name: &str, + schema: Arc, + ) -> datafusion::common::Result>> { + Python::attach(|py| { + let py_schema = match schema + .as_any() + .downcast_ref::() + { + Some(wrapped_schema) => wrapped_schema.schema_provider.as_any(), + None => &PySchema::new_from_parts(schema, self.codec.clone()) + .into_py_any(py) + .map_err(to_datafusion_err)?, + }; 
+ + let provider = self.catalog_provider.bind(py); + let schema = provider + .call_method1("register_schema", (name, py_schema)) + .map_err(to_datafusion_err)?; + if schema.is_none() { + return Ok(None); + } + + let schema = Arc::new(RustWrappedPySchemaProvider::new(schema.into())) + as Arc; + + Ok(Some(schema)) + }) + } + + fn deregister_schema( + &self, + name: &str, + cascade: bool, + ) -> datafusion::common::Result>> { + Python::attach(|py| { + let provider = self.catalog_provider.bind(py); + let schema = provider + .call_method1("deregister_schema", (name, cascade)) + .map_err(to_datafusion_err)?; + if schema.is_none() { + return Ok(None); + } + + let schema = Arc::new(RustWrappedPySchemaProvider::new(schema.into())) + as Arc; + + Ok(Some(schema)) + }) + } +} + +#[derive(Debug)] +pub(crate) struct RustWrappedPyCatalogProviderList { + pub(crate) catalog_provider_list: Py, + codec: Arc, +} + +impl RustWrappedPyCatalogProviderList { + pub fn new(catalog_provider_list: Py, codec: Arc) -> Self { + Self { + catalog_provider_list, + codec, + } + } + + fn catalog_inner(&self, name: &str) -> PyResult>> { + Python::attach(|py| { + let provider = self.catalog_provider_list.bind(py); + + let py_schema = provider.call_method1("catalog", (name,))?; + if py_schema.is_none() { + return Ok(None); + } + + extract_catalog_provider_from_pyobj(py_schema, self.codec.as_ref()).map(Some) + }) + } +} + +#[async_trait] +impl CatalogProviderList for RustWrappedPyCatalogProviderList { + fn as_any(&self) -> &dyn Any { + self + } + + fn catalog_names(&self) -> Vec { + Python::attach(|py| { + let provider = self.catalog_provider_list.bind(py); + provider + .call_method0("catalog_names") + .and_then(|names| names.extract::>()) + .map(|names| names.into_iter().collect()) + .unwrap_or_else(|err| { + log::error!("Unable to get catalog_names: {err}"); + Vec::default() + }) + }) + } + + fn catalog(&self, name: &str) -> Option> { + self.catalog_inner(name).unwrap_or_else(|err| { + 
log::error!("CatalogProvider catalog returned error: {err}"); + None + }) + } + + fn register_catalog( + &self, + name: String, + catalog: Arc, + ) -> Option> { + Python::attach(|py| { + let py_catalog = match catalog + .as_any() + .downcast_ref::() + { + Some(wrapped_schema) => wrapped_schema.catalog_provider.as_any().clone_ref(py), + None => { + match PyCatalog::new_from_parts(catalog, self.codec.clone()).into_py_any(py) { + Ok(c) => c, + Err(err) => { + log::error!( + "register_catalog returned error during conversion to PyAny: {err}" + ); + return None; + } + } + } + }; + + let provider = self.catalog_provider_list.bind(py); + let catalog = match provider.call_method1("register_catalog", (name, py_catalog)) { + Ok(c) => c, + Err(err) => { + log::error!("register_catalog returned error: {err}"); + return None; + } + }; + if catalog.is_none() { + return None; + } + + let catalog = Arc::new(RustWrappedPyCatalogProvider::new( + catalog.into(), + self.codec.clone(), + )) as Arc; + + Some(catalog) + }) + } +} + +fn extract_catalog_provider_from_pyobj( + mut catalog_provider: Bound, + codec: &FFI_LogicalExtensionCodec, +) -> PyResult> { + if catalog_provider.hasattr("__datafusion_catalog_provider__")? { + let py = catalog_provider.py(); + let codec_capsule = create_logical_extension_capsule(py, codec)?; + catalog_provider = catalog_provider + .getattr("__datafusion_catalog_provider__")? + .call1((codec_capsule,))?; + } + + let provider = if let Ok(capsule) = catalog_provider.cast::() { + validate_pycapsule(capsule, "datafusion_catalog_provider")?; + let data: NonNull = capsule + .pointer_checked(Some(c_str!("datafusion_catalog_provider")))? 
+ .cast(); + let provider = unsafe { data.as_ref() }; + let provider: Arc = provider.into(); + provider as Arc + } else { + match catalog_provider.extract::() { + Ok(py_catalog) => py_catalog.catalog, + Err(_) => Arc::new(RustWrappedPyCatalogProvider::new( + catalog_provider.into(), + Arc::new(codec.clone()), + )) as Arc, + } + }; + + Ok(provider) +} + +fn extract_schema_provider_from_pyobj( + mut schema_provider: Bound, + codec: &FFI_LogicalExtensionCodec, +) -> PyResult> { + if schema_provider.hasattr("__datafusion_schema_provider__")? { + let py = schema_provider.py(); + let codec_capsule = create_logical_extension_capsule(py, codec)?; + schema_provider = schema_provider + .getattr("__datafusion_schema_provider__")? + .call1((codec_capsule,))?; + } + + let provider = if let Ok(capsule) = schema_provider.cast::() { + validate_pycapsule(capsule, "datafusion_schema_provider")?; + + let data: NonNull = capsule + .pointer_checked(Some(c_str!("datafusion_schema_provider")))? + .cast(); + let provider = unsafe { data.as_ref() }; + let provider: Arc = provider.into(); + provider as Arc + } else { + match schema_provider.extract::() { + Ok(py_schema) => py_schema.schema, + Err(_) => Arc::new(RustWrappedPySchemaProvider::new(schema_provider.into())) + as Arc, + } + }; + + Ok(provider) +} + +fn extract_logical_extension_codec( + py: Python, + obj: Option>, +) -> PyResult> { + let obj = match obj { + Some(obj) => obj, + None => PySessionContext::global_ctx()?.into_bound_py_any(py)?, + }; + ffi_logical_codec_from_pycapsule(obj).map(Arc::new) +} + +pub(crate) fn init_module(m: &Bound<'_, PyModule>) -> PyResult<()> { + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + + Ok(()) +} diff --git a/src/common.rs b/crates/core/src/common.rs similarity index 100% rename from src/common.rs rename to crates/core/src/common.rs diff --git a/src/common/data_type.rs b/crates/core/src/common/data_type.rs similarity index 95% rename from src/common/data_type.rs rename to 
crates/core/src/common/data_type.rs index 3cbe31332..af4179806 100644 --- a/src/common/data_type.rs +++ b/crates/core/src/common/data_type.rs @@ -15,13 +15,18 @@ // specific language governing permissions and limitations // under the License. +use std::sync::Arc; + use datafusion::arrow::array::Array; use datafusion::arrow::datatypes::{DataType, IntervalUnit, TimeUnit}; use datafusion::common::ScalarValue; -use datafusion::logical_expr::sqlparser::ast::NullTreatment as DFNullTreatment; -use pyo3::exceptions::PyNotImplementedError; -use pyo3::{exceptions::PyValueError, prelude::*}; +use datafusion::logical_expr::expr::NullTreatment as DFNullTreatment; +use pyo3::exceptions::{PyNotImplementedError, PyValueError}; +use pyo3::prelude::*; +/// A [`ScalarValue`] wrapped in a Python object. This struct allows for conversion +/// from a variety of Python objects into a [`ScalarValue`]. See +/// ``FromPyArrow::from_pyarrow_bound`` for conversion details. #[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd)] pub struct PyScalarValue(pub ScalarValue); @@ -37,7 +42,14 @@ impl From for ScalarValue { } #[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] -#[pyclass(frozen, eq, eq_int, name = "RexType", module = "datafusion.common")] +#[pyclass( + from_py_object, + frozen, + eq, + eq_int, + name = "RexType", + module = "datafusion.common" +)] pub enum RexType { Alias, Literal, @@ -58,7 +70,12 @@ pub enum RexType { /// to map types from one system to another.
// TODO: This looks like this needs pyo3 tracking so leaving unfrozen for now #[derive(Debug, Clone)] -#[pyclass(name = "DataTypeMap", module = "datafusion.common", subclass)] +#[pyclass( + from_py_object, + name = "DataTypeMap", + module = "datafusion.common", + subclass +)] pub struct DataTypeMap { #[pyo3(get, set)] pub arrow_type: PyDataType, @@ -261,6 +278,12 @@ impl DataTypeMap { ScalarValue::Float16(_) => Ok(DataType::Float16), ScalarValue::Float32(_) => Ok(DataType::Float32), ScalarValue::Float64(_) => Ok(DataType::Float64), + ScalarValue::Decimal32(_, precision, scale) => { + Ok(DataType::Decimal32(*precision, *scale)) + } + ScalarValue::Decimal64(_, precision, scale) => { + Ok(DataType::Decimal64(*precision, *scale)) + } ScalarValue::Decimal128(_, precision, scale) => { Ok(DataType::Decimal128(*precision, *scale)) } @@ -338,6 +361,10 @@ impl DataTypeMap { ScalarValue::Map(_) => Err(PyNotImplementedError::new_err( "ScalarValue::Map".to_string(), )), + ScalarValue::RunEndEncoded(field1, field2, _) => Ok(DataType::RunEndEncoded( + Arc::clone(field1), + Arc::clone(field2), + )), } } } @@ -578,7 +605,12 @@ impl DataTypeMap { /// Since `DataType` exists in another package we cannot make that happen here so we wrap /// `DataType` as `PyDataType` This exists solely to satisfy those constraints. 
#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] -#[pyclass(frozen, name = "DataType", module = "datafusion.common")] +#[pyclass( + from_py_object, + frozen, + name = "DataType", + module = "datafusion.common" +)] pub struct PyDataType { pub data_type: DataType, } @@ -636,7 +668,14 @@ impl From for PyDataType { /// Represents the possible Python types that can be mapped to the SQL types #[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] -#[pyclass(frozen, eq, eq_int, name = "PythonType", module = "datafusion.common")] +#[pyclass( + from_py_object, + frozen, + eq, + eq_int, + name = "PythonType", + module = "datafusion.common" +)] pub enum PythonType { Array, Bool, @@ -656,7 +695,14 @@ pub enum PythonType { #[allow(non_camel_case_types)] #[allow(clippy::upper_case_acronyms)] #[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] -#[pyclass(frozen, eq, eq_int, name = "SqlType", module = "datafusion.common")] +#[pyclass( + from_py_object, + frozen, + eq, + eq_int, + name = "SqlType", + module = "datafusion.common" +)] pub enum SqlType { ANY, ARRAY, @@ -715,6 +761,7 @@ pub enum SqlType { #[allow(clippy::upper_case_acronyms)] #[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] #[pyclass( + from_py_object, frozen, eq, eq_int, diff --git a/src/common/df_schema.rs b/crates/core/src/common/df_schema.rs similarity index 93% rename from src/common/df_schema.rs rename to crates/core/src/common/df_schema.rs index eb62469cf..9167e772e 100644 --- a/src/common/df_schema.rs +++ b/crates/core/src/common/df_schema.rs @@ -21,7 +21,13 @@ use datafusion::common::DFSchema; use pyo3::prelude::*; #[derive(Debug, Clone)] -#[pyclass(frozen, name = "DFSchema", module = "datafusion.common", subclass)] +#[pyclass( + from_py_object, + frozen, + name = "DFSchema", + module = "datafusion.common", + subclass +)] pub struct PyDFSchema { schema: Arc, } diff --git a/src/common/function.rs b/crates/core/src/common/function.rs similarity index 93% rename from 
src/common/function.rs rename to crates/core/src/common/function.rs index bc6f23160..41cab515f 100644 --- a/src/common/function.rs +++ b/crates/core/src/common/function.rs @@ -22,7 +22,13 @@ use pyo3::prelude::*; use super::data_type::PyDataType; -#[pyclass(frozen, name = "SqlFunction", module = "datafusion.common", subclass)] +#[pyclass( + from_py_object, + frozen, + name = "SqlFunction", + module = "datafusion.common", + subclass +)] #[derive(Debug, Clone)] pub struct SqlFunction { pub name: String, diff --git a/src/common/schema.rs b/crates/core/src/common/schema.rs similarity index 91% rename from src/common/schema.rs rename to crates/core/src/common/schema.rs index 14ab630d3..29a27b204 100644 --- a/src/common/schema.rs +++ b/crates/core/src/common/schema.rs @@ -15,27 +15,32 @@ // specific language governing permissions and limitations // under the License. +use std::any::Any; +use std::borrow::Cow; use std::fmt::{self, Display, Formatter}; use std::sync::Arc; -use std::{any::Any, borrow::Cow}; use arrow::datatypes::Schema; use arrow::pyarrow::PyArrowType; use datafusion::arrow::datatypes::SchemaRef; use datafusion::common::Constraints; use datafusion::datasource::TableType; +use datafusion::logical_expr::utils::split_conjunction; use datafusion::logical_expr::{Expr, TableProviderFilterPushDown, TableSource}; +use parking_lot::RwLock; use pyo3::prelude::*; -use datafusion::logical_expr::utils::split_conjunction; - +use super::data_type::DataTypeMap; +use super::function::SqlFunction; use crate::sql::logical::PyLogicalPlan; -use super::{data_type::DataTypeMap, function::SqlFunction}; - -use parking_lot::RwLock; - -#[pyclass(name = "SqlSchema", module = "datafusion.common", subclass, frozen)] +#[pyclass( + from_py_object, + name = "SqlSchema", + module = "datafusion.common", + subclass, + frozen +)] #[derive(Debug, Clone)] pub struct SqlSchema { name: Arc>, @@ -44,7 +49,12 @@ pub struct SqlSchema { functions: Arc>>, } -#[pyclass(name = "SqlTable", module = 
"datafusion.common", subclass)] +#[pyclass( + from_py_object, + name = "SqlTable", + module = "datafusion.common", + subclass +)] #[derive(Debug, Clone)] pub struct SqlTable { #[pyo3(get, set)] @@ -88,7 +98,12 @@ impl SqlTable { } } -#[pyclass(name = "SqlView", module = "datafusion.common", subclass)] +#[pyclass( + from_py_object, + name = "SqlView", + module = "datafusion.common", + subclass +)] #[derive(Debug, Clone)] pub struct SqlView { #[pyo3(get, set)] @@ -248,7 +263,13 @@ fn is_supported_push_down_expr(_expr: &Expr) -> bool { true } -#[pyclass(frozen, name = "SqlStatistics", module = "datafusion.common", subclass)] +#[pyclass( + from_py_object, + frozen, + name = "SqlStatistics", + module = "datafusion.common", + subclass +)] #[derive(Debug, Clone)] pub struct SqlStatistics { row_count: f64, @@ -267,7 +288,13 @@ impl SqlStatistics { } } -#[pyclass(frozen, name = "Constraints", module = "datafusion.expr", subclass)] +#[pyclass( + from_py_object, + frozen, + name = "Constraints", + module = "datafusion.expr", + subclass +)] #[derive(Clone)] pub struct PyConstraints { pub constraints: Constraints, @@ -292,7 +319,14 @@ impl Display for PyConstraints { } #[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] -#[pyclass(frozen, eq, eq_int, name = "TableType", module = "datafusion.common")] +#[pyclass( + from_py_object, + frozen, + eq, + eq_int, + name = "TableType", + module = "datafusion.common" +)] pub enum PyTableType { Base, View, @@ -319,7 +353,13 @@ impl From for PyTableType { } } -#[pyclass(frozen, name = "TableSource", module = "datafusion.common", subclass)] +#[pyclass( + from_py_object, + frozen, + name = "TableSource", + module = "datafusion.common", + subclass +)] #[derive(Clone)] pub struct PyTableSource { pub table_source: Arc, diff --git a/src/config.rs b/crates/core/src/config.rs similarity index 86% rename from src/config.rs rename to crates/core/src/config.rs index 1726e5d9b..fdb693a12 100644 --- a/src/config.rs +++ 
b/crates/core/src/config.rs @@ -17,15 +17,20 @@ use std::sync::Arc; +use datafusion::config::ConfigOptions; +use parking_lot::RwLock; use pyo3::prelude::*; use pyo3::types::*; -use datafusion::config::ConfigOptions; - +use crate::common::data_type::PyScalarValue; use crate::errors::PyDataFusionResult; -use crate::utils::py_obj_to_scalar_value; -use parking_lot::RwLock; -#[pyclass(name = "Config", module = "datafusion", subclass, frozen)] +#[pyclass( + from_py_object, + name = "Config", + module = "datafusion", + subclass, + frozen +)] #[derive(Clone)] pub(crate) struct PyConfig { config: Arc>, @@ -65,15 +70,15 @@ impl PyConfig { } /// Set a configuration option - pub fn set(&self, key: &str, value: PyObject, py: Python) -> PyDataFusionResult<()> { - let scalar_value = py_obj_to_scalar_value(py, value)?; + pub fn set(&self, key: &str, value: Py, py: Python) -> PyDataFusionResult<()> { + let scalar_value: PyScalarValue = value.extract(py)?; let mut options = self.config.write(); - options.set(key, scalar_value.to_string().as_str())?; + options.set(key, scalar_value.0.to_string().as_str())?; Ok(()) } /// Get all configuration options - pub fn get_all(&self, py: Python) -> PyResult { + pub fn get_all(&self, py: Python) -> PyResult> { let entries: Vec<(String, Option)> = { let options = self.config.read(); options diff --git a/src/context.rs b/crates/core/src/context.rs similarity index 77% rename from src/context.rs rename to crates/core/src/context.rs index dc18a7676..200b6470b 100644 --- a/src/context.rs +++ b/crates/core/src/context.rs @@ -17,49 +17,25 @@ use std::collections::{HashMap, HashSet}; use std::path::PathBuf; +use std::ptr::NonNull; use std::str::FromStr; use std::sync::Arc; use arrow::array::RecordBatchReader; use arrow::ffi_stream::ArrowArrayStreamReader; use arrow::pyarrow::FromPyArrow; -use datafusion::execution::session_state::SessionStateBuilder; -use object_store::ObjectStore; -use url::Url; -use uuid::Uuid; - -use pyo3::exceptions::{PyKeyError, 
PyValueError}; -use pyo3::prelude::*; - -use crate::catalog::{PyCatalog, RustWrappedPyCatalogProvider}; -use crate::dataframe::PyDataFrame; -use crate::dataset::Dataset; -use crate::errors::{py_datafusion_err, to_datafusion_err, PyDataFusionResult}; -use crate::expr::sort_expr::PySortExpr; -use crate::physical_plan::PyExecutionPlan; -use crate::record_batch::PyRecordBatchStream; -use crate::sql::exceptions::py_value_err; -use crate::sql::logical::PyLogicalPlan; -use crate::store::StorageContexts; -use crate::table::PyTable; -use crate::udaf::PyAggregateUDF; -use crate::udf::PyScalarUDF; -use crate::udtf::PyTableFunction; -use crate::udwf::PyWindowUDF; -use crate::utils::{get_global_ctx, get_tokio_runtime, validate_pycapsule, wait_for_future}; use datafusion::arrow::datatypes::{DataType, Schema, SchemaRef}; use datafusion::arrow::pyarrow::PyArrowType; use datafusion::arrow::record_batch::RecordBatch; -use datafusion::catalog::CatalogProvider; -use datafusion::common::TableReference; -use datafusion::common::{exec_err, ScalarValue}; +use datafusion::catalog::{CatalogProvider, CatalogProviderList, TableProviderFactory}; +use datafusion::common::{ScalarValue, TableReference, exec_err}; use datafusion::datasource::file_format::file_compression_type::FileCompressionType; use datafusion::datasource::file_format::parquet::ParquetFormat; use datafusion::datasource::listing::{ ListingOptions, ListingTable, ListingTableConfig, ListingTableUrl, }; -use datafusion::datasource::MemTable; -use datafusion::datasource::TableProvider; +use datafusion::datasource::{MemTable, TableProvider}; +use datafusion::execution::TaskContextProvider; use datafusion::execution::context::{ DataFilePaths, SQLOptions, SessionConfig, SessionContext, TaskContext, }; @@ -67,17 +43,59 @@ use datafusion::execution::disk_manager::DiskManagerMode; use datafusion::execution::memory_pool::{FairSpillPool, GreedyMemoryPool, UnboundedMemoryPool}; use datafusion::execution::options::ReadOptions; use 
datafusion::execution::runtime_env::RuntimeEnvBuilder; -use datafusion::physical_plan::SendableRecordBatchStream; +use datafusion::execution::session_state::SessionStateBuilder; use datafusion::prelude::{ - AvroReadOptions, CsvReadOptions, DataFrame, NdJsonReadOptions, ParquetReadOptions, + AvroReadOptions, CsvReadOptions, DataFrame, JsonReadOptions, ParquetReadOptions, +}; +use datafusion_ffi::catalog_provider::FFI_CatalogProvider; +use datafusion_ffi::catalog_provider_list::FFI_CatalogProviderList; +use datafusion_ffi::execution::FFI_TaskContextProvider; +use datafusion_ffi::proto::logical_extension_codec::FFI_LogicalExtensionCodec; +use datafusion_ffi::table_provider_factory::FFI_TableProviderFactory; +use datafusion_proto::logical_plan::DefaultLogicalExtensionCodec; +use datafusion_python_util::{ + create_logical_extension_capsule, ffi_logical_codec_from_pycapsule, get_global_ctx, + get_tokio_runtime, spawn_future, validate_pycapsule, wait_for_future, }; -use datafusion_ffi::catalog_provider::{FFI_CatalogProvider, ForeignCatalogProvider}; -use pyo3::types::{PyCapsule, PyDict, PyList, PyTuple, PyType}; +use object_store::ObjectStore; use pyo3::IntoPyObjectExt; -use tokio::task::JoinHandle; +use pyo3::exceptions::{PyKeyError, PyValueError}; +use pyo3::ffi::c_str; +use pyo3::prelude::*; +use pyo3::types::{PyCapsule, PyDict, PyList, PyTuple}; +use url::Url; +use uuid::Uuid; + +use crate::catalog::{ + PyCatalog, PyCatalogList, RustWrappedPyCatalogProvider, RustWrappedPyCatalogProviderList, +}; +use crate::common::data_type::PyScalarValue; +use crate::dataframe::PyDataFrame; +use crate::dataset::Dataset; +use crate::errors::{ + PyDataFusionError, PyDataFusionResult, from_datafusion_error, py_datafusion_err, +}; +use crate::expr::sort_expr::PySortExpr; +use crate::options::PyCsvReadOptions; +use crate::physical_plan::PyExecutionPlan; +use crate::record_batch::PyRecordBatchStream; +use crate::sql::logical::PyLogicalPlan; +use 
crate::sql::util::replace_placeholders_with_strings; +use crate::store::StorageContexts; +use crate::table::{PyTable, RustWrappedPyTableProviderFactory}; +use crate::udaf::PyAggregateUDF; +use crate::udf::PyScalarUDF; +use crate::udtf::PyTableFunction; +use crate::udwf::PyWindowUDF; /// Configuration options for a SessionContext -#[pyclass(frozen, name = "SessionConfig", module = "datafusion", subclass)] +#[pyclass( + from_py_object, + frozen, + name = "SessionConfig", + module = "datafusion", + subclass +)] #[derive(Clone, Default)] pub struct PySessionConfig { pub config: SessionConfig, @@ -170,7 +188,13 @@ impl PySessionConfig { } /// Runtime options for a SessionContext -#[pyclass(frozen, name = "RuntimeEnvBuilder", module = "datafusion", subclass)] +#[pyclass( + from_py_object, + frozen, + name = "RuntimeEnvBuilder", + module = "datafusion", + subclass +)] #[derive(Clone)] pub struct PyRuntimeEnvBuilder { pub builder: RuntimeEnvBuilder, @@ -257,7 +281,13 @@ impl PyRuntimeEnvBuilder { } /// `PySQLOptions` allows you to specify options to the sql execution. -#[pyclass(frozen, name = "SQLOptions", module = "datafusion", subclass)] +#[pyclass( + from_py_object, + frozen, + name = "SQLOptions", + module = "datafusion", + subclass +)] #[derive(Clone)] pub struct PySQLOptions { pub options: SQLOptions, @@ -296,10 +326,17 @@ impl PySQLOptions { /// `PySessionContext` is able to plan and execute DataFusion plans. /// It has a powerful optimizer, a physical planner for local execution, and a /// multi-threaded execution engine to perform the execution. 
-#[pyclass(frozen, name = "SessionContext", module = "datafusion", subclass)] +#[pyclass( + from_py_object, + frozen, + name = "SessionContext", + module = "datafusion", + subclass +)] #[derive(Clone)] pub struct PySessionContext { - pub ctx: SessionContext, + pub ctx: Arc, + logical_codec: Arc, } #[pymethods] @@ -326,23 +363,24 @@ impl PySessionContext { .with_runtime_env(runtime) .with_default_features() .build(); - Ok(PySessionContext { - ctx: SessionContext::new_with_state(session_state), - }) + let ctx = Arc::new(SessionContext::new_with_state(session_state)); + let logical_codec = Self::default_logical_codec(&ctx); + Ok(PySessionContext { ctx, logical_codec }) } pub fn enable_url_table(&self) -> PyResult { Ok(PySessionContext { - ctx: self.ctx.clone().enable_url_table(), + ctx: Arc::new(self.ctx.as_ref().clone().enable_url_table()), + logical_codec: Arc::clone(&self.logical_codec), }) } - #[classmethod] + #[staticmethod] #[pyo3(signature = ())] - fn global_ctx(_cls: &Bound<'_, PyType>) -> PyResult { - Ok(Self { - ctx: get_global_ctx().clone(), - }) + pub fn global_ctx() -> PyResult { + let ctx = get_global_ctx().clone(); + let logical_codec = Self::default_logical_codec(&ctx); + Ok(Self { ctx, logical_codec }) } /// Register an object store with the given name @@ -427,27 +465,42 @@ impl PySessionContext { self.ctx.register_udtf(&name, func); } - /// Returns a PyDataFrame whose plan corresponds to the SQL statement. 
- pub fn sql(&self, query: &str, py: Python) -> PyDataFusionResult { - let result = self.ctx.sql(query); - let df = wait_for_future(py, result)??; - Ok(PyDataFrame::new(df)) - } - - #[pyo3(signature = (query, options=None))] + #[pyo3(signature = (query, options=None, param_values=HashMap::default(), param_strings=HashMap::default()))] pub fn sql_with_options( &self, - query: &str, - options: Option, py: Python, + mut query: String, + options: Option, + param_values: HashMap, + param_strings: HashMap, ) -> PyDataFusionResult { let options = if let Some(options) = options { options.options } else { SQLOptions::new() }; - let result = self.ctx.sql_with_options(query, options); - let df = wait_for_future(py, result)??; + + let param_values = param_values + .into_iter() + .map(|(name, value)| (name, ScalarValue::from(value))) + .collect::>(); + + let state = self.ctx.state(); + let dialect = state.config().options().sql_parser.dialect.as_ref(); + + if !param_strings.is_empty() { + query = replace_placeholders_with_strings(&query, dialect, param_strings)?; + } + + let mut df = wait_for_future(py, async { + self.ctx.sql_with_options(&query, options).await + })? 
+ .map_err(from_datafusion_error)?; + + if !param_values.is_empty() { + df = df.with_param_values(param_values)?; + } + Ok(PyDataFrame::new(df)) } @@ -555,7 +608,7 @@ impl PySessionContext { (array.schema().as_ref().to_owned(), vec![array]) } else { - return Err(crate::errors::PyDataFusionError::Common( + return Err(PyDataFusionError::Common( "Expected either a Arrow Array or Arrow Stream in from_arrow().".to_string(), )); }; @@ -595,7 +648,8 @@ impl PySessionContext { } pub fn register_table(&self, name: &str, table: Bound<'_, PyAny>) -> PyDataFusionResult<()> { - let table = PyTable::new(&table)?; + let session = self.clone().into_bound_py_any(table.py())?; + let table = PyTable::new(table, Some(session))?; self.ctx.register_table(name, table.table)?; Ok(()) @@ -606,26 +660,110 @@ impl PySessionContext { Ok(()) } + pub fn register_table_factory( + &self, + format: &str, + mut factory: Bound<'_, PyAny>, + ) -> PyDataFusionResult<()> { + if factory.hasattr("__datafusion_table_provider_factory__")? { + let py = factory.py(); + let codec_capsule = create_logical_extension_capsule(py, self.logical_codec.as_ref())?; + factory = factory + .getattr("__datafusion_table_provider_factory__")? + .call1((codec_capsule,))?; + } + + let factory: Arc = + if let Ok(capsule) = factory.cast::().map_err(py_datafusion_err) { + validate_pycapsule(capsule, "datafusion_table_provider_factory")?; + + let data: NonNull = capsule + .pointer_checked(Some(c_str!("datafusion_table_provider_factory")))? 
+ .cast(); + let factory = unsafe { data.as_ref() }; + factory.into() + } else { + Arc::new(RustWrappedPyTableProviderFactory::new( + factory.into(), + self.logical_codec.clone(), + )) + }; + + let st = self.ctx.state_ref(); + let mut lock = st.write(); + lock.table_factories_mut() + .insert(format.to_owned(), factory); + + Ok(()) + } + + pub fn register_catalog_provider_list( + &self, + mut provider: Bound, + ) -> PyDataFusionResult<()> { + if provider.hasattr("__datafusion_catalog_provider_list__")? { + let py = provider.py(); + let codec_capsule = create_logical_extension_capsule(py, self.logical_codec.as_ref())?; + provider = provider + .getattr("__datafusion_catalog_provider_list__")? + .call1((codec_capsule,))?; + } + + let provider = if let Ok(capsule) = provider.cast::().map_err(py_datafusion_err) + { + validate_pycapsule(capsule, "datafusion_catalog_provider_list")?; + + let data: NonNull = capsule + .pointer_checked(Some(c_str!("datafusion_catalog_provider_list")))? + .cast(); + let provider = unsafe { data.as_ref() }; + let provider: Arc = provider.into(); + provider as Arc + } else { + match provider.extract::() { + Ok(py_catalog_list) => py_catalog_list.catalog_list, + Err(_) => Arc::new(RustWrappedPyCatalogProviderList::new( + provider.into(), + Arc::clone(&self.logical_codec), + )) as Arc, + } + }; + + self.ctx.register_catalog_list(provider); + + Ok(()) + } + pub fn register_catalog_provider( &self, name: &str, - provider: Bound<'_, PyAny>, + mut provider: Bound<'_, PyAny>, ) -> PyDataFusionResult<()> { - let provider = if provider.hasattr("__datafusion_catalog_provider__")? { - let capsule = provider + if provider.hasattr("__datafusion_catalog_provider__")? { + let py = provider.py(); + let codec_capsule = create_logical_extension_capsule(py, self.logical_codec.as_ref())?; + provider = provider .getattr("__datafusion_catalog_provider__")? 
- .call0()?; - let capsule = capsule.downcast::().map_err(py_datafusion_err)?; + .call1((codec_capsule,))?; + } + + let provider = if let Ok(capsule) = provider.cast::().map_err(py_datafusion_err) + { validate_pycapsule(capsule, "datafusion_catalog_provider")?; - let provider = unsafe { capsule.reference::() }; - let provider: ForeignCatalogProvider = provider.into(); - Arc::new(provider) as Arc + let data: NonNull = capsule + .pointer_checked(Some(c_str!("datafusion_catalog_provider")))? + .cast(); + let provider = unsafe { data.as_ref() }; + let provider: Arc = provider.into(); + provider as Arc } else { match provider.extract::() { Ok(py_catalog) => py_catalog.catalog, - Err(_) => Arc::new(RustWrappedPyCatalogProvider::new(provider.into())) - as Arc, + Err(_) => Arc::new(RustWrappedPyCatalogProvider::new( + provider.into(), + Arc::clone(&self.logical_codec), + )) as Arc, } }; @@ -696,41 +834,20 @@ impl PySessionContext { Ok(()) } - #[allow(clippy::too_many_arguments)] #[pyo3(signature = (name, path, - schema=None, - has_header=true, - delimiter=",", - schema_infer_max_records=1000, - file_extension=".csv", - file_compression_type=None))] + options=None))] pub fn register_csv( &self, name: &str, path: &Bound<'_, PyAny>, - schema: Option>, - has_header: bool, - delimiter: &str, - schema_infer_max_records: usize, - file_extension: &str, - file_compression_type: Option, + options: Option<&PyCsvReadOptions>, py: Python, ) -> PyDataFusionResult<()> { - let delimiter = delimiter.as_bytes(); - if delimiter.len() != 1 { - return Err(crate::errors::PyDataFusionError::PythonError(py_value_err( - "Delimiter must be a single character", - ))); - } - - let mut options = CsvReadOptions::new() - .has_header(has_header) - .delimiter(delimiter[0]) - .schema_infer_max_records(schema_infer_max_records) - .file_extension(file_extension) - .file_compression_type(parse_file_compression_type(file_compression_type)?); - options.schema = schema.as_ref().map(|x| &x.0); + let options = 
options + .map(|opts| opts.try_into()) + .transpose()? + .unwrap_or_default(); if path.is_instance_of::() { let paths = path.extract::>()?; @@ -768,7 +885,7 @@ impl PySessionContext { .to_str() .ok_or_else(|| PyValueError::new_err("Unable to convert path to a string"))?; - let mut options = NdJsonReadOptions::default() + let mut options = JsonReadOptions::default() .file_compression_type(parse_file_compression_type(file_compression_type)?) .table_partition_cols( table_partition_cols @@ -850,20 +967,21 @@ impl PySessionContext { } #[pyo3(signature = (name="datafusion"))] - pub fn catalog(&self, name: &str) -> PyResult { + pub fn catalog(&self, py: Python, name: &str) -> PyResult> { let catalog = self.ctx.catalog(name).ok_or(PyKeyError::new_err(format!( "Catalog with name {name} doesn't exist." )))?; - Python::with_gil(|py| { - match catalog - .as_any() - .downcast_ref::() - { - Some(wrapped_schema) => Ok(wrapped_schema.catalog_provider.clone_ref(py)), - None => PyCatalog::from(catalog).into_py_any(py), - } - }) + match catalog + .as_any() + .downcast_ref::() + { + Some(wrapped_schema) => Ok(wrapped_schema.catalog_provider.clone_ref(py)), + None => Ok( + PyCatalog::new_from_parts(catalog, Arc::clone(&self.logical_codec)) + .into_py_any(py)?, + ), + } } pub fn catalog_names(&self) -> HashSet { @@ -891,10 +1009,10 @@ impl PySessionContext { match res { Ok(df) => Ok(PyDataFrame::new(df)), Err(e) => { - if let datafusion::error::DataFusionError::Plan(msg) = &e { - if msg.contains("No table named") { - return Err(PyKeyError::new_err(msg.to_string())); - } + if let datafusion::error::DataFusionError::Plan(msg) = &e + && msg.contains("No table named") + { + return Err(PyKeyError::new_err(msg.to_string())); } Err(py_datafusion_err(e)) } @@ -928,7 +1046,7 @@ impl PySessionContext { let path = path .to_str() .ok_or_else(|| PyValueError::new_err("Unable to convert path to a string"))?; - let mut options = NdJsonReadOptions::default() + let mut options = 
JsonReadOptions::default() .table_partition_cols( table_partition_cols .into_iter() @@ -949,48 +1067,19 @@ impl PySessionContext { Ok(PyDataFrame::new(df)) } - #[allow(clippy::too_many_arguments)] #[pyo3(signature = ( path, - schema=None, - has_header=true, - delimiter=",", - schema_infer_max_records=1000, - file_extension=".csv", - table_partition_cols=vec![], - file_compression_type=None))] + options=None))] pub fn read_csv( &self, path: &Bound<'_, PyAny>, - schema: Option>, - has_header: bool, - delimiter: &str, - schema_infer_max_records: usize, - file_extension: &str, - table_partition_cols: Vec<(String, PyArrowType)>, - file_compression_type: Option, + options: Option<&PyCsvReadOptions>, py: Python, ) -> PyDataFusionResult { - let delimiter = delimiter.as_bytes(); - if delimiter.len() != 1 { - return Err(crate::errors::PyDataFusionError::PythonError(py_value_err( - "Delimiter must be a single character", - ))); - }; - - let mut options = CsvReadOptions::new() - .has_header(has_header) - .delimiter(delimiter[0]) - .schema_infer_max_records(schema_infer_max_records) - .file_extension(file_extension) - .table_partition_cols( - table_partition_cols - .into_iter() - .map(|(name, ty)| (name, ty.0)) - .collect::>(), - ) - .file_compression_type(parse_file_compression_type(file_compression_type)?); - options.schema = schema.as_ref().map(|x| &x.0); + let options = options + .map(|opts| opts.try_into()) + .transpose()? 
+ .unwrap_or_default(); if path.is_instance_of::() { let paths = path.extract::>()?; @@ -1077,7 +1166,8 @@ impl PySessionContext { } pub fn read_table(&self, table: Bound<'_, PyAny>) -> PyDataFusionResult { - let table = PyTable::new(&table)?; + let session = self.clone().into_bound_py_any(table.py())?; + let table = PyTable::new(table, Some(session))?; let df = self.ctx.read_table(table.table())?; Ok(PyDataFrame::new(df)) } @@ -1107,14 +1197,43 @@ impl PySessionContext { py: Python, ) -> PyDataFusionResult { let ctx: TaskContext = TaskContext::from(&self.ctx.state()); - // create a Tokio runtime to run the async code - let rt = &get_tokio_runtime().0; let plan = plan.plan.clone(); - let fut: JoinHandle> = - rt.spawn(async move { plan.execute(part, Arc::new(ctx)) }); - let stream = wait_for_future(py, async { fut.await.map_err(to_datafusion_err) })???; + let stream = spawn_future(py, async move { plan.execute(part, Arc::new(ctx)) })?; Ok(PyRecordBatchStream::new(stream)) } + + pub fn __datafusion_task_context_provider__<'py>( + &self, + py: Python<'py>, + ) -> PyResult> { + let name = cr"datafusion_task_context_provider".into(); + + let ctx_provider = Arc::clone(&self.ctx) as Arc; + let ffi_ctx_provider = FFI_TaskContextProvider::from(&ctx_provider); + + PyCapsule::new(py, ffi_ctx_provider, Some(name)) + } + + pub fn __datafusion_logical_extension_codec__<'py>( + &self, + py: Python<'py>, + ) -> PyResult> { + create_logical_extension_capsule(py, self.logical_codec.as_ref()) + } + + pub fn with_logical_extension_codec<'py>( + &self, + codec: Bound<'py, PyAny>, + ) -> PyDataFusionResult { + let logical_codec = Arc::new(ffi_logical_codec_from_pycapsule(codec)?); + + Ok({ + Self { + ctx: Arc::clone(&self.ctx), + logical_codec, + } + }) + } } impl PySessionContext { @@ -1161,6 +1280,17 @@ impl PySessionContext { .register_table(TableReference::Bare { table: name.into() }, Arc::new(table))?; Ok(()) } + + fn default_logical_codec(ctx: &Arc) -> Arc { + let codec = 
Arc::new(DefaultLogicalExtensionCodec {}); + let runtime = get_tokio_runtime().handle().clone(); + let ctx_provider = Arc::clone(ctx) as Arc; + Arc::new(FFI_LogicalExtensionCodec::new( + codec, + Some(runtime), + &ctx_provider, + )) + } } pub fn parse_file_compression_type( @@ -1174,12 +1304,15 @@ pub fn parse_file_compression_type( impl From for SessionContext { fn from(ctx: PySessionContext) -> SessionContext { - ctx.ctx + ctx.ctx.as_ref().clone() } } impl From for PySessionContext { fn from(ctx: SessionContext) -> PySessionContext { - PySessionContext { ctx } + let ctx = Arc::new(ctx); + let logical_codec = Self::default_logical_codec(&ctx); + + PySessionContext { ctx, logical_codec } } } diff --git a/src/dataframe.rs b/crates/core/src/dataframe.rs similarity index 75% rename from src/dataframe.rs rename to crates/core/src/dataframe.rs index 1f87f99d9..29fc05ed3 100644 --- a/src/dataframe.rs +++ b/crates/core/src/dataframe.rs @@ -16,49 +16,52 @@ // under the License. use std::collections::HashMap; -use std::ffi::CString; +use std::ffi::{CStr, CString}; +use std::ptr::NonNull; +use std::str::FromStr; use std::sync::Arc; -use arrow::array::{new_null_array, RecordBatch, RecordBatchIterator, RecordBatchReader}; +use arrow::array::{Array, ArrayRef, RecordBatch, RecordBatchReader, new_null_array}; use arrow::compute::can_cast_types; use arrow::error::ArrowError; use arrow::ffi::FFI_ArrowSchema; use arrow::ffi_stream::FFI_ArrowArrayStream; use arrow::pyarrow::FromPyArrow; -use datafusion::arrow::datatypes::Schema; +use cstr::cstr; +use datafusion::arrow::datatypes::{Schema, SchemaRef}; use datafusion::arrow::pyarrow::{PyArrowType, ToPyArrow}; use datafusion::arrow::util::pretty; +use datafusion::catalog::TableProvider; use datafusion::common::UnnestOptions; use datafusion::config::{CsvOptions, ParquetColumnOptions, ParquetOptions, TableParquetOptions}; use datafusion::dataframe::{DataFrame, DataFrameWriteOptions}; use datafusion::error::DataFusionError; use 
datafusion::execution::SendableRecordBatchStream; -use datafusion::logical_expr::dml::InsertOp; use datafusion::logical_expr::SortExpr; +use datafusion::logical_expr::dml::InsertOp; use datafusion::parquet::basic::{BrotliLevel, Compression, GzipLevel, ZstdLevel}; use datafusion::prelude::*; +use datafusion_python_util::{is_ipython_env, spawn_future, validate_pycapsule, wait_for_future}; use futures::{StreamExt, TryStreamExt}; +use parking_lot::Mutex; +use pyo3::PyErr; use pyo3::exceptions::PyValueError; +use pyo3::ffi::c_str; use pyo3::prelude::*; use pyo3::pybacked::PyBackedStr; use pyo3::types::{PyCapsule, PyList, PyTuple, PyTupleMethods}; -use tokio::task::JoinHandle; -use crate::errors::{py_datafusion_err, to_datafusion_err, PyDataFusionError}; -use crate::expr::sort_expr::to_sort_expressions; +use crate::common::data_type::PyScalarValue; +use crate::errors::{PyDataFusionError, PyDataFusionResult, py_datafusion_err}; +use crate::expr::PyExpr; +use crate::expr::sort_expr::{PySortExpr, to_sort_expressions}; use crate::physical_plan::PyExecutionPlan; -use crate::record_batch::PyRecordBatchStream; +use crate::record_batch::{PyRecordBatchStream, poll_next_batch}; use crate::sql::logical::PyLogicalPlan; -use crate::table::PyTable; -use crate::utils::{ - get_tokio_runtime, is_ipython_env, py_obj_to_scalar_value, validate_pycapsule, wait_for_future, -}; -use crate::{ - errors::PyDataFusionResult, - expr::{sort_expr::PySortExpr, PyExpr}, -}; +use crate::table::{PyTable, TempViewTable}; -use parking_lot::Mutex; +/// File-level static CStr for the Arrow array stream capsule name. +static ARROW_ARRAY_STREAM_NAME: &CStr = cstr!("arrow_array_stream"); // Type aliases to simplify very complex types used in this file and // avoid compiler complaints about deeply nested types in struct fields. 
@@ -70,18 +73,18 @@ type SharedCachedBatches = Arc>; pub struct FormatterConfig { /// Maximum memory in bytes to use for display (default: 2MB) pub max_bytes: usize, - /// Minimum number of rows to display (default: 20) + /// Minimum number of rows to display (default: 10) pub min_rows: usize, - /// Number of rows to include in __repr__ output (default: 10) - pub repr_rows: usize, + /// Maximum number of rows to include in __repr__ output (default: 10) + pub max_rows: usize, } impl Default for FormatterConfig { fn default() -> Self { Self { max_bytes: 2 * 1024 * 1024, // 2MB - min_rows: 20, - repr_rows: 10, + min_rows: 10, + max_rows: 10, } } } @@ -101,8 +104,12 @@ impl FormatterConfig { return Err("min_rows must be a positive integer".to_string()); } - if self.repr_rows == 0 { - return Err("repr_rows must be a positive integer".to_string()); + if self.max_rows == 0 { + return Err("max_rows must be a positive integer".to_string()); + } + + if self.min_rows > self.max_rows { + return Err("min_rows must be less than or equal to max_rows".to_string()); } Ok(()) @@ -134,11 +141,11 @@ fn import_python_formatter(py: Python<'_>) -> PyResult> { // Helper function to extract attributes with fallback to default fn get_attr<'a, T>(py_object: &'a Bound<'a, PyAny>, attr_name: &str, default_value: T) -> T where - T: for<'py> pyo3::FromPyObject<'py> + Clone, + T: for<'py> pyo3::FromPyObject<'py, 'py> + Clone, { py_object .getattr(attr_name) - .and_then(|v| v.extract::()) + .and_then(|v| v.extract::().map_err(Into::::into)) .unwrap_or_else(|_| default_value.clone()) } @@ -146,13 +153,30 @@ where fn build_formatter_config_from_python(formatter: &Bound<'_, PyAny>) -> PyResult { let default_config = FormatterConfig::default(); let max_bytes = get_attr(formatter, "max_memory_bytes", default_config.max_bytes); - let min_rows = get_attr(formatter, "min_rows_display", default_config.min_rows); - let repr_rows = get_attr(formatter, "repr_rows", default_config.repr_rows); + let min_rows = 
get_attr(formatter, "min_rows", default_config.min_rows); + + // Backward compatibility: Try max_rows first (new name), fall back to repr_rows (deprecated), + // then use default. This ensures backward compatibility with custom formatter implementations + // during the deprecation period. + let max_rows = get_attr(formatter, "max_rows", 0usize); + let max_rows = if max_rows > 0 { + // max_rows attribute exists and has a value + max_rows + } else { + // Try the deprecated repr_rows attribute + let repr_rows = get_attr(formatter, "repr_rows", 0usize); + if repr_rows > 0 { + repr_rows + } else { + // Use default + default_config.max_rows + } + }; let config = FormatterConfig { max_bytes, min_rows, - repr_rows, + max_rows, }; // Return the validated config, converting String error to PyErr @@ -161,7 +185,13 @@ fn build_formatter_config_from_python(formatter: &Bound<'_, PyAny>) -> PyResult< } /// Python mapping of `ParquetOptions` (includes just the writer-related options). -#[pyclass(frozen, name = "ParquetWriterOptions", module = "datafusion", subclass)] +#[pyclass( + from_py_object, + frozen, + name = "ParquetWriterOptions", + module = "datafusion", + subclass +)] #[derive(Clone, Default)] pub struct PyParquetWriterOptions { options: ParquetOptions, @@ -174,7 +204,7 @@ impl PyParquetWriterOptions { pub fn new( data_pagesize_limit: usize, write_batch_size: usize, - writer_version: String, + writer_version: &str, skip_arrow_metadata: bool, compression: Option, dictionary_enabled: Option, @@ -192,8 +222,11 @@ impl PyParquetWriterOptions { allow_single_file_parallelism: bool, maximum_parallel_row_group_writers: usize, maximum_buffered_record_batches_per_stream: usize, - ) -> Self { - Self { + ) -> PyResult { + let writer_version = + datafusion::common::parquet_config::DFParquetWriterVersion::from_str(writer_version) + .map_err(py_datafusion_err)?; + Ok(Self { options: ParquetOptions { data_pagesize_limit, write_batch_size, @@ -217,12 +250,18 @@ impl 
PyParquetWriterOptions { maximum_buffered_record_batches_per_stream, ..Default::default() }, - } + }) } } /// Python mapping of `ParquetColumnOptions`. -#[pyclass(frozen, name = "ParquetColumnOptions", module = "datafusion", subclass)] +#[pyclass( + from_py_object, + frozen, + name = "ParquetColumnOptions", + module = "datafusion", + subclass +)] #[derive(Clone, Default)] pub struct PyParquetColumnOptions { options: ParquetColumnOptions, @@ -257,7 +296,13 @@ impl PyParquetColumnOptions { /// A PyDataFrame is a representation of a logical plan and an API to compose statements. /// Use it to build a plan and `.collect()` to execute the plan and collect the result. /// The actual execution of a plan runs natively on Rust and Arrow on a multi-threaded environment. -#[pyclass(name = "DataFrame", module = "datafusion", subclass, frozen)] +#[pyclass( + from_py_object, + name = "DataFrame", + module = "datafusion", + subclass, + frozen +)] #[derive(Clone)] pub struct PyDataFrame { df: Arc, @@ -280,7 +325,11 @@ impl PyDataFrame { Arc::clone(&self.df) } - fn prepare_repr_string(&self, py: Python, as_html: bool) -> PyDataFusionResult { + fn prepare_repr_string<'py>( + &self, + py: Python<'py>, + as_html: bool, + ) -> PyDataFusionResult { // Get the Python formatter and config let PythonFormatter { formatter, config } = get_python_formatter_with_config(py)?; @@ -308,11 +357,11 @@ impl PyDataFrame { let table_uuid = uuid::Uuid::new_v4().to_string(); - // Convert record batches to PyObject list + // Convert record batches to Py list let py_batches = batches .iter() .map(|rb| rb.to_pyarrow(py)) - .collect::>>()?; + .collect::>>>()?; let py_schema = self.schema().into_pyobject(py)?; @@ -338,6 +387,80 @@ impl PyDataFrame { Ok(html_str) } + + async fn collect_column_inner(&self, column: &str) -> Result { + let batches = self + .df + .as_ref() + .clone() + .select_columns(&[column])? 
+ .collect() + .await?; + + let arrays = batches + .iter() + .map(|b| b.column(0).as_ref()) + .collect::>(); + + arrow_select::concat::concat(&arrays).map_err(Into::into) + } +} + +/// Synchronous wrapper around partitioned [`SendableRecordBatchStream`]s used +/// for the `__arrow_c_stream__` implementation. +/// +/// It drains each partition's stream sequentially, yielding record batches in +/// their original partition order. When a `projection` is set, each batch is +/// converted via `record_batch_into_schema` to apply schema changes per batch. +struct PartitionedDataFrameStreamReader { + streams: Vec, + schema: SchemaRef, + projection: Option, + current: usize, +} + +impl Iterator for PartitionedDataFrameStreamReader { + type Item = Result; + + fn next(&mut self) -> Option { + while self.current < self.streams.len() { + let stream = &mut self.streams[self.current]; + let fut = poll_next_batch(stream); + let result = Python::attach(|py| wait_for_future(py, fut)); + + match result { + Ok(Ok(Some(batch))) => { + let batch = if let Some(ref schema) = self.projection { + match record_batch_into_schema(batch, schema.as_ref()) { + Ok(b) => b, + Err(e) => return Some(Err(e)), + } + } else { + batch + }; + return Some(Ok(batch)); + } + Ok(Ok(None)) => { + self.current += 1; + continue; + } + Ok(Err(e)) => { + return Some(Err(ArrowError::ExternalError(Box::new(e)))); + } + Err(e) => { + return Some(Err(ArrowError::ExternalError(Box::new(e)))); + } + } + } + + None + } +} + +impl RecordBatchReader for PartitionedDataFrameStreamReader { + fn schema(&self) -> SchemaRef { + self.schema.clone() + } } #[pymethods] @@ -347,7 +470,7 @@ impl PyDataFrame { if let Ok(key) = key.extract::() { // df[col] self.select_columns(vec![key]) - } else if let Ok(tuple) = key.downcast::() { + } else if let Ok(tuple) = key.cast::() { // df[col1, col2, col3] let keys = tuple .iter() @@ -411,7 +534,7 @@ impl PyDataFrame { /// Returns the schema from the logical plan fn schema(&self) -> 
PyArrowType { - PyArrowType(self.df.schema().into()) + PyArrowType(self.df.schema().as_arrow().clone()) } /// Convert this DataFrame into a Table Provider that can be used in register_table @@ -420,11 +543,15 @@ impl PyDataFrame { /// because we're working with Python bindings /// where objects are shared #[allow(clippy::wrong_self_convention)] - pub fn into_view(&self) -> PyDataFusionResult { - // Call the underlying Rust DataFrame::into_view method. - // Note that the Rust method consumes self; here we clone the inner Arc - // so that we don't invalidate this PyDataFrame. - let table_provider = self.df.as_ref().clone().into_view(); + pub fn into_view(&self, temporary: bool) -> PyDataFusionResult { + let table_provider = if temporary { + Arc::new(TempViewTable::new(Arc::clone(&self.df))) as Arc + } else { + // Call the underlying Rust DataFrame::into_view method. + // Note that the Rust method consumes self; here we clone the inner Arc + // so that we don't invalidate this PyDataFrame. + self.df.as_ref().clone().into_view() + }; Ok(PyTable::from(table_provider)) } @@ -465,7 +592,7 @@ impl PyDataFrame { self.df .as_ref() .parse_sql_expr(&expr) - .map(|e| PyExpr::from(e)) + .map(PyExpr::from) .map_err(PyDataFusionError::from) } @@ -518,7 +645,7 @@ impl PyDataFrame { /// Executes the plan, returning a list of `RecordBatch`es. /// Unless some order is specified in the plan, there is no /// guarantee of the order of the result. - fn collect(&self, py: Python) -> PyResult> { + fn collect<'py>(&self, py: Python<'py>) -> PyResult>> { let batches = wait_for_future(py, self.df.as_ref().clone().collect())? .map_err(PyDataFusionError::from)?; // cannot use PyResult> return type due to @@ -534,7 +661,7 @@ impl PyDataFrame { /// Executes this DataFrame and collects all results into a vector of vector of RecordBatch /// maintaining the input partitioning. 
- fn collect_partitioned(&self, py: Python) -> PyResult>> { + fn collect_partitioned<'py>(&self, py: Python<'py>) -> PyResult>>> { let batches = wait_for_future(py, self.df.as_ref().clone().collect_partitioned())? .map_err(PyDataFusionError::from)?; @@ -544,6 +671,13 @@ impl PyDataFrame { .collect() } + fn collect_column<'py>(&self, py: Python<'py>, column: &str) -> PyResult> { + wait_for_future(py, self.collect_column_inner(column))? + .map_err(PyDataFusionError::from)? + .to_data() + .to_pyarrow(py) + } + /// Print the result, 20 lines by default #[pyo3(signature = (num=20))] fn show(&self, py: Python, num: usize) -> PyDataFusionResult<()> { @@ -563,6 +697,7 @@ impl PyDataFrame { how: &str, left_on: Vec, right_on: Vec, + coalesce_keys: bool, ) -> PyDataFusionResult { let join_type = match how { "inner" => JoinType::Inner, @@ -581,13 +716,62 @@ impl PyDataFrame { let left_keys = left_on.iter().map(|s| s.as_ref()).collect::>(); let right_keys = right_on.iter().map(|s| s.as_ref()).collect::>(); - let df = self.df.as_ref().clone().join( + let mut df = self.df.as_ref().clone().join( right.df.as_ref().clone(), join_type, &left_keys, &right_keys, None, )?; + + if coalesce_keys { + let mutual_keys = left_keys + .iter() + .zip(right_keys.iter()) + .filter(|(l, r)| l == r) + .map(|(key, _)| *key) + .collect::>(); + + let fields_to_coalesce = mutual_keys + .iter() + .map(|name| { + let qualified_fields = df + .logical_plan() + .schema() + .qualified_fields_with_unqualified_name(name); + (*name, qualified_fields) + }) + .filter(|(_, fields)| fields.len() == 2) + .collect::>(); + + let expr: Vec = df + .logical_plan() + .schema() + .fields() + .into_iter() + .enumerate() + .map(|(idx, _)| df.logical_plan().schema().qualified_field(idx)) + .filter_map(|(qualifier, field)| { + if let Some((key_name, qualified_fields)) = fields_to_coalesce + .iter() + .find(|(_, qf)| qf.contains(&(qualifier, field))) + { + // Only add the coalesce expression once (when we encounter the first 
field) + // Skip the second field (it's already included in to coalesce) + if (qualifier, field) == qualified_fields[0] { + let left_col = Expr::Column(Column::from(qualified_fields[0])); + let right_col = Expr::Column(Column::from(qualified_fields[1])); + return Some(coalesce(vec![left_col, right_col]).alias(*key_name)); + } + None + } else { + Some(Expr::Column(Column::from((qualifier, field)))) + } + }) + .collect(); + df = df.select(expr)?; + } + Ok(Self::new(df)) } @@ -902,14 +1086,21 @@ impl PyDataFrame { /// Convert to Arrow Table /// Collect the batches and pass to Arrow Table - fn to_arrow_table(&self, py: Python<'_>) -> PyResult { + fn to_arrow_table(&self, py: Python<'_>) -> PyResult> { let batches = self.collect(py)?.into_pyobject(py)?; - let schema = self.schema().into_pyobject(py)?; + + // only use the DataFrame's schema if there are no batches, otherwise let the schema be + // determined from the batches (avoids some inconsistencies with nullable columns) + let args = if batches.len()? == 0 { + let schema = self.schema().into_pyobject(py)?; + PyTuple::new(py, &[batches, schema])? + } else { + PyTuple::new(py, &[batches])? 
+ }; // Instantiate pyarrow Table object and use its from_batches method let table_class = py.import("pyarrow")?.getattr("Table")?; - let args = PyTuple::new(py, &[batches, schema])?; - let table: PyObject = table_class.call_method1("from_batches", args)?.into(); + let table: Py = table_class.call_method1("from_batches", args)?.into(); Ok(table) } @@ -919,59 +1110,59 @@ impl PyDataFrame { py: Python<'py>, requested_schema: Option>, ) -> PyDataFusionResult> { - let mut batches = wait_for_future(py, self.df.as_ref().clone().collect())??; - let mut schema: Schema = self.df.schema().to_owned().into(); + let df = self.df.as_ref().clone(); + let streams = spawn_future(py, async move { df.execute_stream_partitioned().await })?; + + let mut schema: Schema = self.df.schema().to_owned().as_arrow().clone(); + let mut projection: Option = None; if let Some(schema_capsule) = requested_schema { validate_pycapsule(&schema_capsule, "arrow_schema")?; - let schema_ptr = unsafe { schema_capsule.reference::() }; + let data: NonNull = schema_capsule + .pointer_checked(Some(c_str!("arrow_schema")))? 
+ .cast(); + let schema_ptr = unsafe { data.as_ref() }; let desired_schema = Schema::try_from(schema_ptr)?; schema = project_schema(schema, desired_schema)?; - - batches = batches - .into_iter() - .map(|record_batch| record_batch_into_schema(record_batch, &schema)) - .collect::, ArrowError>>()?; + projection = Some(Arc::new(schema.clone())); } - let batches_wrapped = batches.into_iter().map(Ok); + let schema_ref = Arc::new(schema.clone()); - let reader = RecordBatchIterator::new(batches_wrapped, Arc::new(schema)); + let reader = PartitionedDataFrameStreamReader { + streams, + schema: schema_ref, + projection, + current: 0, + }; let reader: Box = Box::new(reader); - let ffi_stream = FFI_ArrowArrayStream::new(reader); - let stream_capsule_name = CString::new("arrow_array_stream").unwrap(); - PyCapsule::new(py, ffi_stream, Some(stream_capsule_name)).map_err(PyDataFusionError::from) + // Create the Arrow stream and wrap it in a PyCapsule. The default + // destructor provided by PyO3 will drop the stream unless ownership is + // transferred to PyArrow during import. 
+ let stream = FFI_ArrowArrayStream::new(reader); + let name = CString::new(ARROW_ARRAY_STREAM_NAME.to_bytes()).unwrap(); + let capsule = PyCapsule::new(py, stream, Some(name))?; + Ok(capsule) } fn execute_stream(&self, py: Python) -> PyDataFusionResult { - // create a Tokio runtime to run the async code - let rt = &get_tokio_runtime().0; let df = self.df.as_ref().clone(); - let fut: JoinHandle> = - rt.spawn(async move { df.execute_stream().await }); - let stream = wait_for_future(py, async { fut.await.map_err(to_datafusion_err) })???; + let stream = spawn_future(py, async move { df.execute_stream().await })?; Ok(PyRecordBatchStream::new(stream)) } fn execute_stream_partitioned(&self, py: Python) -> PyResult> { - // create a Tokio runtime to run the async code - let rt = &get_tokio_runtime().0; let df = self.df.as_ref().clone(); - let fut: JoinHandle>> = - rt.spawn(async move { df.execute_stream_partitioned().await }); - let stream = wait_for_future(py, async { fut.await.map_err(to_datafusion_err) })? - .map_err(py_datafusion_err)? 
- .map_err(py_datafusion_err)?; - - Ok(stream.into_iter().map(PyRecordBatchStream::new).collect()) + let streams = spawn_future(py, async move { df.execute_stream_partitioned().await })?; + Ok(streams.into_iter().map(PyRecordBatchStream::new).collect()) } /// Convert to pandas dataframe with pyarrow /// Collect the batches, pass to Arrow Table & then convert to Pandas DataFrame - fn to_pandas(&self, py: Python<'_>) -> PyResult { + fn to_pandas(&self, py: Python<'_>) -> PyResult> { let table = self.to_arrow_table(py)?; // See also: https://arrow.apache.org/docs/python/generated/pyarrow.Table.html#pyarrow.Table.to_pandas @@ -981,7 +1172,7 @@ impl PyDataFrame { /// Convert to Python list using pyarrow /// Each list item represents one row encoded as dictionary - fn to_pylist(&self, py: Python<'_>) -> PyResult { + fn to_pylist(&self, py: Python<'_>) -> PyResult> { let table = self.to_arrow_table(py)?; // See also: https://arrow.apache.org/docs/python/generated/pyarrow.Table.html#pyarrow.Table.to_pylist @@ -991,7 +1182,7 @@ impl PyDataFrame { /// Convert to Python dictionary using pyarrow /// Each dictionary key is a column and the dictionary value represents the column values - fn to_pydict(&self, py: Python) -> PyResult { + fn to_pydict(&self, py: Python) -> PyResult> { let table = self.to_arrow_table(py)?; // See also: https://arrow.apache.org/docs/python/generated/pyarrow.Table.html#pyarrow.Table.to_pydict @@ -1001,11 +1192,11 @@ impl PyDataFrame { /// Convert to polars dataframe with pyarrow /// Collect the batches, pass to Arrow Table & then convert to polars DataFrame - fn to_polars(&self, py: Python<'_>) -> PyResult { + fn to_polars(&self, py: Python<'_>) -> PyResult> { let table = self.to_arrow_table(py)?; let dataframe = py.import("polars")?.getattr("DataFrame")?; let args = PyTuple::new(py, &[table])?; - let result: PyObject = dataframe.call1(args)?.into(); + let result: Py = dataframe.call1(args)?.into(); Ok(result) } @@ -1018,24 +1209,31 @@ impl PyDataFrame 
{ #[pyo3(signature = (value, columns=None))] fn fill_null( &self, - value: PyObject, + value: Py, columns: Option>, py: Python, ) -> PyDataFusionResult { - let scalar_value = py_obj_to_scalar_value(py, value)?; + let scalar_value: PyScalarValue = value.extract(py)?; let cols = match columns { Some(col_names) => col_names.iter().map(|c| c.to_string()).collect(), None => Vec::new(), // Empty vector means fill null for all columns }; - let df = self.df.as_ref().clone().fill_null(scalar_value, cols)?; + let df = self.df.as_ref().clone().fill_null(scalar_value.0, cols)?; Ok(Self::new(df)) } } #[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] -#[pyclass(frozen, eq, eq_int, name = "InsertOp", module = "datafusion")] +#[pyclass( + from_py_object, + frozen, + eq, + eq_int, + name = "InsertOp", + module = "datafusion" +)] pub enum PyInsertOp { APPEND, REPLACE, @@ -1053,7 +1251,12 @@ impl From for InsertOp { } #[derive(Debug, Clone)] -#[pyclass(frozen, name = "DataFrameWriteOptions", module = "datafusion")] +#[pyclass( + from_py_object, + frozen, + name = "DataFrameWriteOptions", + module = "datafusion" +)] pub struct PyDataFrameWriteOptions { insert_operation: InsertOp, single_file_output: bool, @@ -1127,7 +1330,11 @@ fn project_schema(from_schema: Schema, to_schema: Schema) -> Result { break; @@ -1212,11 +1425,14 @@ async fn collect_record_batches_to_display( if rows_in_rb > 0 { size_estimate_so_far += rb.get_array_memory_size(); + // When memory limit is exceeded, scale back row count proportionally to stay within budget if size_estimate_so_far > max_bytes { let ratio = max_bytes as f32 / size_estimate_so_far as f32; let total_rows = rows_in_rb + rows_so_far; + // Calculate reduced rows maintaining the memory/data proportion let mut reduced_row_num = (total_rows as f32 * ratio).round() as usize; + // Ensure we always respect the minimum rows guarantee if reduced_row_num < min_rows { reduced_row_num = min_rows.min(total_rows); } @@ -1229,8 +1445,8 @@ async fn 
collect_record_batches_to_display( } } - if rows_in_rb + rows_so_far > repr_rows { - rb = rb.slice(0, repr_rows - rows_so_far); + if rows_in_rb + rows_so_far > max_rows { + rb = rb.slice(0, max_rows - rows_so_far); has_more = true; } diff --git a/src/dataset.rs b/crates/core/src/dataset.rs similarity index 95% rename from src/dataset.rs rename to crates/core/src/dataset.rs index 0baf4da2a..dbeafcd9f 100644 --- a/src/dataset.rs +++ b/crates/core/src/dataset.rs @@ -15,25 +15,22 @@ // specific language governing permissions and limitations // under the License. -use datafusion::catalog::Session; -use pyo3::exceptions::PyValueError; -/// Implements a Datafusion TableProvider that delegates to a PyArrow Dataset -/// This allows us to use PyArrow Datasets as Datafusion tables while pushing down projections and filters -use pyo3::prelude::*; -use pyo3::types::PyType; - use std::any::Any; use std::sync::Arc; use async_trait::async_trait; - use datafusion::arrow::datatypes::SchemaRef; use datafusion::arrow::pyarrow::PyArrowType; +use datafusion::catalog::Session; use datafusion::datasource::{TableProvider, TableType}; use datafusion::error::{DataFusionError, Result as DFResult}; -use datafusion::logical_expr::Expr; -use datafusion::logical_expr::TableProviderFilterPushDown; +use datafusion::logical_expr::{Expr, TableProviderFilterPushDown}; use datafusion::physical_plan::ExecutionPlan; +use pyo3::exceptions::PyValueError; +/// Implements a Datafusion TableProvider that delegates to a PyArrow Dataset +/// This allows us to use PyArrow Datasets as Datafusion tables while pushing down projections and filters +use pyo3::prelude::*; +use pyo3::types::PyType; use crate::dataset_exec::DatasetExec; use crate::pyarrow_filter_expression::PyArrowFilterExpression; @@ -41,7 +38,7 @@ use crate::pyarrow_filter_expression::PyArrowFilterExpression; // Wraps a pyarrow.dataset.Dataset class and implements a Datafusion TableProvider around it #[derive(Debug)] pub(crate) struct Dataset { - 
dataset: PyObject, + dataset: Py, } impl Dataset { @@ -50,7 +47,7 @@ impl Dataset { // Ensure that we were passed an instance of pyarrow.dataset.Dataset let ds = PyModule::import(py, "pyarrow.dataset")?; let ds_attr = ds.getattr("Dataset")?; - let ds_type = ds_attr.downcast::()?; + let ds_type = ds_attr.cast::()?; if dataset.is_instance(ds_type)? { Ok(Dataset { dataset: dataset.clone().unbind(), @@ -73,7 +70,7 @@ impl TableProvider for Dataset { /// Get a reference to the schema for this table fn schema(&self) -> SchemaRef { - Python::with_gil(|py| { + Python::attach(|py| { let dataset = self.dataset.bind(py); // This can panic but since we checked that self.dataset is a pyarrow.dataset.Dataset it should never Arc::new( @@ -107,7 +104,7 @@ impl TableProvider for Dataset { // The datasource should return *at least* this number of rows if available. _limit: Option, ) -> DFResult> { - Python::with_gil(|py| { + Python::attach(|py| { let plan: Arc = Arc::new( DatasetExec::new(py, self.dataset.bind(py), projection.cloned(), filters) .map_err(|err| DataFusionError::External(Box::new(err)))?, diff --git a/src/dataset_exec.rs b/crates/core/src/dataset_exec.rs similarity index 93% rename from src/dataset_exec.rs rename to crates/core/src/dataset_exec.rs index aab8d7566..e3c058c07 100644 --- a/src/dataset_exec.rs +++ b/crates/core/src/dataset_exec.rs @@ -15,32 +15,29 @@ // specific language governing permissions and limitations // under the License. 
-use datafusion::physical_plan::execution_plan::{Boundedness, EmissionType}; -/// Implements a Datafusion physical ExecutionPlan that delegates to a PyArrow Dataset -/// This actually performs the projection, filtering and scanning of a Dataset -use pyo3::prelude::*; -use pyo3::types::{PyDict, PyIterator, PyList}; - use std::any::Any; use std::sync::Arc; -use futures::{stream, TryStreamExt}; - use datafusion::arrow::datatypes::SchemaRef; -use datafusion::arrow::error::ArrowError; -use datafusion::arrow::error::Result as ArrowResult; +use datafusion::arrow::error::{ArrowError, Result as ArrowResult}; use datafusion::arrow::pyarrow::PyArrowType; use datafusion::arrow::record_batch::RecordBatch; use datafusion::error::{DataFusionError as InnerDataFusionError, Result as DFResult}; use datafusion::execution::context::TaskContext; -use datafusion::logical_expr::utils::conjunction; use datafusion::logical_expr::Expr; +use datafusion::logical_expr::utils::conjunction; use datafusion::physical_expr::{EquivalenceProperties, LexOrdering}; +use datafusion::physical_plan::execution_plan::{Boundedness, EmissionType}; use datafusion::physical_plan::stream::RecordBatchStreamAdapter; use datafusion::physical_plan::{ DisplayAs, DisplayFormatType, ExecutionPlan, ExecutionPlanProperties, Partitioning, - SendableRecordBatchStream, Statistics, + PlanProperties, SendableRecordBatchStream, Statistics, }; +use futures::{TryStreamExt, stream}; +/// Implements a Datafusion physical ExecutionPlan that delegates to a PyArrow Dataset +/// This actually performs the projection, filtering and scanning of a Dataset +use pyo3::prelude::*; +use pyo3::types::{PyDict, PyIterator, PyList}; use crate::errors::PyDataFusionResult; use crate::pyarrow_filter_expression::PyArrowFilterExpression; @@ -53,7 +50,7 @@ impl Iterator for PyArrowBatchesAdapter { type Item = ArrowResult; fn next(&mut self) -> Option { - Python::with_gil(|py| { + Python::attach(|py| { let mut batches = 
self.batches.clone_ref(py).into_bound(py); Some( batches @@ -68,13 +65,13 @@ impl Iterator for PyArrowBatchesAdapter { // Wraps a pyarrow.dataset.Dataset class and implements a Datafusion ExecutionPlan around it #[derive(Debug)] pub(crate) struct DatasetExec { - dataset: PyObject, + dataset: Py, schema: SchemaRef, fragments: Py, columns: Option>, - filter_expr: Option, + filter_expr: Option>, projected_statistics: Statistics, - plan_properties: datafusion::physical_plan::PlanProperties, + plan_properties: Arc, } impl DatasetExec { @@ -97,7 +94,7 @@ impl DatasetExec { .collect() }); let columns: Option> = columns.transpose()?; - let filter_expr: Option = conjunction(filters.to_owned()) + let filter_expr: Option> = conjunction(filters.to_owned()) .map(|filters| { PyArrowFilterExpression::try_from(&filters) .map(|filter_expr| filter_expr.inner().clone_ref(py)) @@ -131,15 +128,15 @@ impl DatasetExec { )?; let fragments_iter = pylist.call1((fragments_iterator,))?; - let fragments = fragments_iter.downcast::().map_err(PyErr::from)?; + let fragments = fragments_iter.cast::().map_err(PyErr::from)?; let projected_statistics = Statistics::new_unknown(&schema); - let plan_properties = datafusion::physical_plan::PlanProperties::new( + let plan_properties = Arc::new(PlanProperties::new( EquivalenceProperties::new(schema.clone()), Partitioning::UnknownPartitioning(fragments.len()), EmissionType::Final, Boundedness::Bounded, - ); + )); Ok(DatasetExec { dataset: dataset.clone().unbind(), @@ -187,7 +184,7 @@ impl ExecutionPlan for DatasetExec { context: Arc, ) -> DFResult { let batch_size = context.session_config().batch_size(); - Python::with_gil(|py| { + Python::attach(|py| { let dataset = self.dataset.bind(py); let fragments = self.fragments.bind(py); let fragment = fragments @@ -238,11 +235,11 @@ impl ExecutionPlan for DatasetExec { }) } - fn statistics(&self) -> DFResult { + fn partition_statistics(&self, _partition: Option) -> DFResult { Ok(self.projected_statistics.clone()) 
} - fn properties(&self) -> &datafusion::physical_plan::PlanProperties { + fn properties(&self) -> &Arc { &self.plan_properties } } @@ -272,7 +269,7 @@ impl ExecutionPlanProperties for DatasetExec { impl DisplayAs for DatasetExec { fn fmt_as(&self, t: DisplayFormatType, f: &mut std::fmt::Formatter) -> std::fmt::Result { - Python::with_gil(|py| { + Python::attach(|py| { let number_of_fragments = self.fragments.bind(py).len(); match t { DisplayFormatType::Default diff --git a/crates/core/src/errors.rs b/crates/core/src/errors.rs new file mode 100644 index 000000000..8babc5a56 --- /dev/null +++ b/crates/core/src/errors.rs @@ -0,0 +1,18 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +pub use datafusion_python_util::errors::*; diff --git a/src/expr.rs b/crates/core/src/expr.rs similarity index 91% rename from src/expr.rs rename to crates/core/src/expr.rs index c9eddaa2d..c4f2a12da 100644 --- a/src/expr.rs +++ b/crates/core/src/expr.rs @@ -15,29 +15,36 @@ // specific language governing permissions and limitations // under the License. 
-use datafusion::logical_expr::expr::{AggregateFunctionParams, FieldMetadata}; -use datafusion::logical_expr::utils::exprlist_to_fields; -use datafusion::logical_expr::{ - lit_with_metadata, ExprFuncBuilder, ExprFunctionExt, LogicalPlan, WindowFunctionDefinition, -}; -use pyo3::IntoPyObjectExt; -use pyo3::{basic::CompareOp, prelude::*}; use std::collections::HashMap; use std::convert::{From, Into}; use std::sync::Arc; -use window::PyWindowFrame; use datafusion::arrow::datatypes::{DataType, Field}; use datafusion::arrow::pyarrow::PyArrowType; use datafusion::functions::core::expr_ext::FieldAccessor; +use datafusion::logical_expr::expr::{ + AggregateFunction, AggregateFunctionParams, FieldMetadata, InList, InSubquery, ScalarFunction, + SetComparison, WindowFunction, +}; +use datafusion::logical_expr::utils::exprlist_to_fields; use datafusion::logical_expr::{ - col, - expr::{AggregateFunction, InList, InSubquery, ScalarFunction, WindowFunction}, - lit, Between, BinaryExpr, Case, Cast, Expr, Like, Operator, TryCast, + Between, BinaryExpr, Case, Cast, Expr, ExprFuncBuilder, ExprFunctionExt, Like, LogicalPlan, + Operator, TryCast, WindowFunctionDefinition, col, lit, lit_with_metadata, }; +use pyo3::IntoPyObjectExt; +use pyo3::basic::CompareOp; +use pyo3::prelude::*; +use window::PyWindowFrame; +use self::alias::PyAlias; +use self::bool_expr::{ + PyIsFalse, PyIsNotFalse, PyIsNotNull, PyIsNotTrue, PyIsNotUnknown, PyIsNull, PyIsTrue, + PyIsUnknown, PyNegative, PyNot, +}; +use self::like::{PyILike, PyLike, PySimilarTo}; +use self::scalar_variable::PyScalarVariable; use crate::common::data_type::{DataTypeMap, NullTreatment, PyScalarValue, RexType}; -use crate::errors::{py_runtime_err, py_type_err, py_unsupported_variant_err, PyDataFusionResult}; +use crate::errors::{PyDataFusionResult, py_runtime_err, py_type_err, py_unsupported_variant_err}; use crate::expr::aggregate_expr::PyAggregateFunction; use crate::expr::binary_expr::PyBinaryExpr; use crate::expr::column::PyColumn; @@ 
-46,14 +53,6 @@ use crate::functions::add_builder_fns_to_window; use crate::pyarrow_util::scalar_to_pyarrow; use crate::sql::logical::PyLogicalPlan; -use self::alias::PyAlias; -use self::bool_expr::{ - PyIsFalse, PyIsNotFalse, PyIsNotNull, PyIsNotTrue, PyIsNotUnknown, PyIsNull, PyIsTrue, - PyIsUnknown, PyNegative, PyNot, -}; -use self::like::{PyILike, PyLike, PySimilarTo}; -use self::scalar_variable::PyScalarVariable; - pub mod aggregate; pub mod aggregate_expr; pub mod alias; @@ -99,6 +98,7 @@ pub mod recursive_query; pub mod repartition; pub mod scalar_subquery; pub mod scalar_variable; +pub mod set_comparison; pub mod signature; pub mod sort; pub mod sort_expr; @@ -112,10 +112,16 @@ pub mod unnest_expr; pub mod values; pub mod window; -use sort_expr::{to_sort_expressions, PySortExpr}; +use sort_expr::{PySortExpr, to_sort_expressions}; /// A PyExpr that can be used on a DataFrame -#[pyclass(frozen, name = "RawExpr", module = "datafusion.expr", subclass)] +#[pyclass( + from_py_object, + frozen, + name = "RawExpr", + module = "datafusion.expr", + subclass +)] #[derive(Debug, Clone)] pub struct PyExpr { pub expr: Expr, @@ -142,15 +148,18 @@ pub fn py_expr_list(expr: &[Expr]) -> PyResult> { impl PyExpr { /// Return the specific expression fn to_variant<'py>(&self, py: Python<'py>) -> PyResult> { - Python::with_gil(|_| { - match &self.expr { + Python::attach(|_| match &self.expr { Expr::Alias(alias) => Ok(PyAlias::from(alias.clone()).into_bound_py_any(py)?), Expr::Column(col) => Ok(PyColumn::from(col.clone()).into_bound_py_any(py)?), - Expr::ScalarVariable(data_type, variables) => { - Ok(PyScalarVariable::new(data_type, variables).into_bound_py_any(py)?) + Expr::ScalarVariable(field, variables) => { + Ok(PyScalarVariable::new(field, variables).into_bound_py_any(py)?) 
} Expr::Like(value) => Ok(PyLike::from(value.clone()).into_bound_py_any(py)?), - Expr::Literal(value, metadata) => Ok(PyLiteral::new_with_metadata(value.clone(), metadata.clone()).into_bound_py_any(py)?), + Expr::Literal(value, metadata) => Ok(PyLiteral::new_with_metadata( + value.clone(), + metadata.clone(), + ) + .into_bound_py_any(py)?), Expr::BinaryExpr(expr) => Ok(PyBinaryExpr::from(expr.clone()).into_bound_py_any(py)?), Expr::Not(expr) => Ok(PyNot::new(*expr.clone()).into_bound_py_any(py)?), Expr::IsNotNull(expr) => Ok(PyIsNotNull::new(*expr.clone()).into_bound_py_any(py)?), @@ -160,13 +169,17 @@ impl PyExpr { Expr::IsUnknown(expr) => Ok(PyIsUnknown::new(*expr.clone()).into_bound_py_any(py)?), Expr::IsNotTrue(expr) => Ok(PyIsNotTrue::new(*expr.clone()).into_bound_py_any(py)?), Expr::IsNotFalse(expr) => Ok(PyIsNotFalse::new(*expr.clone()).into_bound_py_any(py)?), - Expr::IsNotUnknown(expr) => Ok(PyIsNotUnknown::new(*expr.clone()).into_bound_py_any(py)?), + Expr::IsNotUnknown(expr) => { + Ok(PyIsNotUnknown::new(*expr.clone()).into_bound_py_any(py)?) + } Expr::Negative(expr) => Ok(PyNegative::new(*expr.clone()).into_bound_py_any(py)?), Expr::AggregateFunction(expr) => { Ok(PyAggregateFunction::from(expr.clone()).into_bound_py_any(py)?) } Expr::SimilarTo(value) => Ok(PySimilarTo::from(value.clone()).into_bound_py_any(py)?), - Expr::Between(value) => Ok(between::PyBetween::from(value.clone()).into_bound_py_any(py)?), + Expr::Between(value) => { + Ok(between::PyBetween::from(value.clone()).into_bound_py_any(py)?) 
+ } Expr::Case(value) => Ok(case::PyCase::from(value.clone()).into_bound_py_any(py)?), Expr::Cast(value) => Ok(cast::PyCast::from(value.clone()).into_bound_py_any(py)?), Expr::TryCast(value) => Ok(cast::PyTryCast::from(value.clone()).into_bound_py_any(py)?), @@ -176,7 +189,9 @@ impl PyExpr { Expr::WindowFunction(value) => Err(py_unsupported_variant_err(format!( "Converting Expr::WindowFunction to a Python object is not implemented: {value:?}" ))), - Expr::InList(value) => Ok(in_list::PyInList::from(value.clone()).into_bound_py_any(py)?), + Expr::InList(value) => { + Ok(in_list::PyInList::from(value.clone()).into_bound_py_any(py)?) + } Expr::Exists(value) => Ok(exists::PyExists::from(value.clone()).into_bound_py_any(py)?), Expr::InSubquery(value) => { Ok(in_subquery::PyInSubquery::from(value.clone()).into_bound_py_any(py)?) @@ -194,11 +209,17 @@ impl PyExpr { Expr::Placeholder(value) => { Ok(placeholder::PyPlaceholder::from(value.clone()).into_bound_py_any(py)?) } - Expr::OuterReferenceColumn(data_type, column) => Err(py_unsupported_variant_err(format!( - "Converting Expr::OuterReferenceColumn to a Python object is not implemented: {data_type:?} - {column:?}" - ))), - Expr::Unnest(value) => Ok(unnest_expr::PyUnnestExpr::from(value.clone()).into_bound_py_any(py)?), - } + Expr::OuterReferenceColumn(data_type, column) => { + Err(py_unsupported_variant_err(format!( + "Converting Expr::OuterReferenceColumn to a Python object is not implemented: {data_type:?} - {column:?}" + ))) + } + Expr::Unnest(value) => { + Ok(unnest_expr::PyUnnestExpr::from(value.clone()).into_bound_py_any(py)?) + } + Expr::SetComparison(value) => { + Ok(set_comparison::PySetComparison::from(value.clone()).into_bound_py_any(py)?) + } }) } @@ -365,11 +386,12 @@ impl PyExpr { | Expr::Placeholder { .. } | Expr::OuterReferenceColumn(_, _) | Expr::Unnest(_) - | Expr::IsNotUnknown(_) => RexType::Call, + | Expr::IsNotUnknown(_) + | Expr::SetComparison(_) => RexType::Call, Expr::ScalarSubquery(..) 
=> RexType::ScalarSubquery, #[allow(deprecated)] Expr::Wildcard { .. } => { - return Err(py_unsupported_variant_err("Expr::Wildcard is unsupported")) + return Err(py_unsupported_variant_err("Expr::Wildcard is unsupported")); } }) } @@ -380,8 +402,8 @@ impl PyExpr { Self::_types(&self.expr) } - /// Extracts the Expr value into a PyObject that can be shared with Python - pub fn python_value(&self, py: Python) -> PyResult { + /// Extracts the Expr value into a Py that can be shared with Python + pub fn python_value<'py>(&self, py: Python<'py>) -> PyResult> { match &self.expr { Expr::Literal(scalar_value, _) => scalar_to_pyarrow(scalar_value, py), _ => Err(py_type_err(format!( @@ -416,7 +438,10 @@ impl PyExpr { | Expr::Negative(expr) | Expr::Cast(Cast { expr, .. }) | Expr::TryCast(TryCast { expr, .. }) - | Expr::InSubquery(InSubquery { expr, .. }) => Ok(vec![PyExpr::from(*expr.clone())]), + | Expr::InSubquery(InSubquery { expr, .. }) + | Expr::SetComparison(SetComparison { expr, .. }) => { + Ok(vec![PyExpr::from(*expr.clone())]) + } // Expr variants containing a collection of Expr(s) for operands Expr::AggregateFunction(AggregateFunction { @@ -556,7 +581,7 @@ impl PyExpr { return Err(py_type_err(format!( "Catch all triggered in get_operator_name: {:?}", &self.expr - ))) + ))); } }) } @@ -637,7 +662,13 @@ impl PyExpr { } } -#[pyclass(frozen, name = "ExprFuncBuilder", module = "datafusion.expr", subclass)] +#[pyclass( + from_py_object, + frozen, + name = "ExprFuncBuilder", + module = "datafusion.expr", + subclass +)] #[derive(Debug, Clone)] pub struct PyExprFuncBuilder { pub builder: ExprFuncBuilder, @@ -748,7 +779,8 @@ impl PyExpr { | Operator::AtQuestion | Operator::Question | Operator::QuestionAnd - | Operator::QuestionPipe => Err(py_type_err(format!("Unsupported expr: ${op}"))), + | Operator::QuestionPipe + | Operator::Colon => Err(py_type_err(format!("Unsupported expr: ${op}"))), }, Expr::Cast(Cast { expr: _, data_type }) => 
DataTypeMap::map_from_arrow_type(data_type), Expr::Literal(scalar_value, _) => DataTypeMap::map_from_scalar_value(scalar_value), diff --git a/src/expr/aggregate.rs b/crates/core/src/expr/aggregate.rs similarity index 96% rename from src/expr/aggregate.rs rename to crates/core/src/expr/aggregate.rs index 4af7c755a..5a6a771a7 100644 --- a/src/expr/aggregate.rs +++ b/crates/core/src/expr/aggregate.rs @@ -15,12 +15,14 @@ // specific language governing permissions and limitations // under the License. +use std::fmt::{self, Display, Formatter}; + use datafusion::common::DataFusionError; +use datafusion::logical_expr::Expr; use datafusion::logical_expr::expr::{AggregateFunction, AggregateFunctionParams, Alias}; use datafusion::logical_expr::logical_plan::Aggregate; -use datafusion::logical_expr::Expr; -use pyo3::{prelude::*, IntoPyObjectExt}; -use std::fmt::{self, Display, Formatter}; +use pyo3::IntoPyObjectExt; +use pyo3::prelude::*; use super::logical_node::LogicalNode; use crate::common::df_schema::PyDFSchema; @@ -28,7 +30,13 @@ use crate::errors::py_type_err; use crate::expr::PyExpr; use crate::sql::logical::PyLogicalPlan; -#[pyclass(frozen, name = "Aggregate", module = "datafusion.expr", subclass)] +#[pyclass( + from_py_object, + frozen, + name = "Aggregate", + module = "datafusion.expr", + subclass +)] #[derive(Clone)] pub struct PyAggregate { aggregate: Aggregate, diff --git a/src/expr/aggregate_expr.rs b/crates/core/src/expr/aggregate_expr.rs similarity index 99% rename from src/expr/aggregate_expr.rs rename to crates/core/src/expr/aggregate_expr.rs index 72ba0638f..88e47999f 100644 --- a/src/expr/aggregate_expr.rs +++ b/crates/core/src/expr/aggregate_expr.rs @@ -15,12 +15,15 @@ // specific language governing permissions and limitations // under the License. 
-use crate::expr::PyExpr; +use std::fmt::{Display, Formatter}; + use datafusion::logical_expr::expr::AggregateFunction; use pyo3::prelude::*; -use std::fmt::{Display, Formatter}; + +use crate::expr::PyExpr; #[pyclass( + from_py_object, frozen, name = "AggregateFunction", module = "datafusion.expr", diff --git a/src/expr/alias.rs b/crates/core/src/expr/alias.rs similarity index 94% rename from src/expr/alias.rs rename to crates/core/src/expr/alias.rs index 588c00fdf..b76e82e22 100644 --- a/src/expr/alias.rs +++ b/crates/core/src/expr/alias.rs @@ -15,13 +15,20 @@ // specific language governing permissions and limitations // under the License. -use crate::expr::PyExpr; -use pyo3::prelude::*; use std::fmt::{self, Display, Formatter}; use datafusion::logical_expr::expr::Alias; +use pyo3::prelude::*; + +use crate::expr::PyExpr; -#[pyclass(frozen, name = "Alias", module = "datafusion.expr", subclass)] +#[pyclass( + from_py_object, + frozen, + name = "Alias", + module = "datafusion.expr", + subclass +)] #[derive(Clone)] pub struct PyAlias { alias: Alias, diff --git a/src/expr/analyze.rs b/crates/core/src/expr/analyze.rs similarity index 93% rename from src/expr/analyze.rs rename to crates/core/src/expr/analyze.rs index c7caeebc8..137765fe1 100644 --- a/src/expr/analyze.rs +++ b/crates/core/src/expr/analyze.rs @@ -15,15 +15,23 @@ // specific language governing permissions and limitations // under the License. 
-use datafusion::logical_expr::logical_plan::Analyze; -use pyo3::{prelude::*, IntoPyObjectExt}; use std::fmt::{self, Display, Formatter}; +use datafusion::logical_expr::logical_plan::Analyze; +use pyo3::IntoPyObjectExt; +use pyo3::prelude::*; + use super::logical_node::LogicalNode; use crate::common::df_schema::PyDFSchema; use crate::sql::logical::PyLogicalPlan; -#[pyclass(frozen, name = "Analyze", module = "datafusion.expr", subclass)] +#[pyclass( + from_py_object, + frozen, + name = "Analyze", + module = "datafusion.expr", + subclass +)] #[derive(Clone)] pub struct PyAnalyze { analyze: Analyze, diff --git a/src/expr/between.rs b/crates/core/src/expr/between.rs similarity index 94% rename from src/expr/between.rs rename to crates/core/src/expr/between.rs index 1f61599a3..6943b6c3b 100644 --- a/src/expr/between.rs +++ b/crates/core/src/expr/between.rs @@ -15,12 +15,20 @@ // specific language governing permissions and limitations // under the License. -use crate::expr::PyExpr; +use std::fmt::{self, Display, Formatter}; + use datafusion::logical_expr::expr::Between; use pyo3::prelude::*; -use std::fmt::{self, Display, Formatter}; -#[pyclass(frozen, name = "Between", module = "datafusion.expr", subclass)] +use crate::expr::PyExpr; + +#[pyclass( + from_py_object, + frozen, + name = "Between", + module = "datafusion.expr", + subclass +)] #[derive(Clone)] pub struct PyBetween { between: Between, diff --git a/src/expr/binary_expr.rs b/crates/core/src/expr/binary_expr.rs similarity index 93% rename from src/expr/binary_expr.rs rename to crates/core/src/expr/binary_expr.rs index 94379583c..2326ba705 100644 --- a/src/expr/binary_expr.rs +++ b/crates/core/src/expr/binary_expr.rs @@ -15,11 +15,18 @@ // specific language governing permissions and limitations // under the License. 
-use crate::expr::PyExpr; use datafusion::logical_expr::BinaryExpr; use pyo3::prelude::*; -#[pyclass(frozen, name = "BinaryExpr", module = "datafusion.expr", subclass)] +use crate::expr::PyExpr; + +#[pyclass( + from_py_object, + frozen, + name = "BinaryExpr", + module = "datafusion.expr", + subclass +)] #[derive(Clone)] pub struct PyBinaryExpr { expr: BinaryExpr, diff --git a/src/expr/bool_expr.rs b/crates/core/src/expr/bool_expr.rs similarity index 83% rename from src/expr/bool_expr.rs rename to crates/core/src/expr/bool_expr.rs index 0d2b051e6..9e374c7e2 100644 --- a/src/expr/bool_expr.rs +++ b/crates/core/src/expr/bool_expr.rs @@ -15,13 +15,20 @@ // specific language governing permissions and limitations // under the License. +use std::fmt::{self, Display, Formatter}; + use datafusion::logical_expr::Expr; use pyo3::prelude::*; -use std::fmt::{self, Display, Formatter}; use super::PyExpr; -#[pyclass(frozen, name = "Not", module = "datafusion.expr", subclass)] +#[pyclass( + from_py_object, + frozen, + name = "Not", + module = "datafusion.expr", + subclass +)] #[derive(Clone, Debug)] pub struct PyNot { expr: Expr, @@ -51,7 +58,13 @@ impl PyNot { } } -#[pyclass(frozen, name = "IsNotNull", module = "datafusion.expr", subclass)] +#[pyclass( + from_py_object, + frozen, + name = "IsNotNull", + module = "datafusion.expr", + subclass +)] #[derive(Clone, Debug)] pub struct PyIsNotNull { expr: Expr, @@ -81,7 +94,13 @@ impl PyIsNotNull { } } -#[pyclass(frozen, name = "IsNull", module = "datafusion.expr", subclass)] +#[pyclass( + from_py_object, + frozen, + name = "IsNull", + module = "datafusion.expr", + subclass +)] #[derive(Clone, Debug)] pub struct PyIsNull { expr: Expr, @@ -111,7 +130,13 @@ impl PyIsNull { } } -#[pyclass(frozen, name = "IsTrue", module = "datafusion.expr", subclass)] +#[pyclass( + from_py_object, + frozen, + name = "IsTrue", + module = "datafusion.expr", + subclass +)] #[derive(Clone, Debug)] pub struct PyIsTrue { expr: Expr, @@ -141,7 +166,13 @@ impl 
PyIsTrue { } } -#[pyclass(frozen, name = "IsFalse", module = "datafusion.expr", subclass)] +#[pyclass( + from_py_object, + frozen, + name = "IsFalse", + module = "datafusion.expr", + subclass +)] #[derive(Clone, Debug)] pub struct PyIsFalse { expr: Expr, @@ -171,7 +202,13 @@ impl PyIsFalse { } } -#[pyclass(frozen, name = "IsUnknown", module = "datafusion.expr", subclass)] +#[pyclass( + from_py_object, + frozen, + name = "IsUnknown", + module = "datafusion.expr", + subclass +)] #[derive(Clone, Debug)] pub struct PyIsUnknown { expr: Expr, @@ -201,7 +238,13 @@ impl PyIsUnknown { } } -#[pyclass(frozen, name = "IsNotTrue", module = "datafusion.expr", subclass)] +#[pyclass( + from_py_object, + frozen, + name = "IsNotTrue", + module = "datafusion.expr", + subclass +)] #[derive(Clone, Debug)] pub struct PyIsNotTrue { expr: Expr, @@ -231,7 +274,13 @@ impl PyIsNotTrue { } } -#[pyclass(frozen, name = "IsNotFalse", module = "datafusion.expr", subclass)] +#[pyclass( + from_py_object, + frozen, + name = "IsNotFalse", + module = "datafusion.expr", + subclass +)] #[derive(Clone, Debug)] pub struct PyIsNotFalse { expr: Expr, @@ -261,7 +310,13 @@ impl PyIsNotFalse { } } -#[pyclass(frozen, name = "IsNotUnknown", module = "datafusion.expr", subclass)] +#[pyclass( + from_py_object, + frozen, + name = "IsNotUnknown", + module = "datafusion.expr", + subclass +)] #[derive(Clone, Debug)] pub struct PyIsNotUnknown { expr: Expr, @@ -291,7 +346,13 @@ impl PyIsNotUnknown { } } -#[pyclass(frozen, name = "Negative", module = "datafusion.expr", subclass)] +#[pyclass( + from_py_object, + frozen, + name = "Negative", + module = "datafusion.expr", + subclass +)] #[derive(Clone, Debug)] pub struct PyNegative { expr: Expr, diff --git a/src/expr/case.rs b/crates/core/src/expr/case.rs similarity index 93% rename from src/expr/case.rs rename to crates/core/src/expr/case.rs index 1a7369826..4f00449d8 100644 --- a/src/expr/case.rs +++ b/crates/core/src/expr/case.rs @@ -15,11 +15,18 @@ // specific language 
governing permissions and limitations // under the License. -use crate::expr::PyExpr; use datafusion::logical_expr::Case; use pyo3::prelude::*; -#[pyclass(frozen, name = "Case", module = "datafusion.expr", subclass)] +use crate::expr::PyExpr; + +#[pyclass( + from_py_object, + frozen, + name = "Case", + module = "datafusion.expr", + subclass +)] #[derive(Clone)] pub struct PyCase { case: Case, diff --git a/src/expr/cast.rs b/crates/core/src/expr/cast.rs similarity index 87% rename from src/expr/cast.rs rename to crates/core/src/expr/cast.rs index 03e2b8476..37d603538 100644 --- a/src/expr/cast.rs +++ b/crates/core/src/expr/cast.rs @@ -15,11 +15,19 @@ // specific language governing permissions and limitations // under the License. -use crate::{common::data_type::PyDataType, expr::PyExpr}; use datafusion::logical_expr::{Cast, TryCast}; use pyo3::prelude::*; -#[pyclass(frozen, name = "Cast", module = "datafusion.expr", subclass)] +use crate::common::data_type::PyDataType; +use crate::expr::PyExpr; + +#[pyclass( + from_py_object, + frozen, + name = "Cast", + module = "datafusion.expr", + subclass +)] #[derive(Clone)] pub struct PyCast { cast: Cast, @@ -48,7 +56,7 @@ impl PyCast { } } -#[pyclass(name = "TryCast", module = "datafusion.expr", subclass)] +#[pyclass(from_py_object, name = "TryCast", module = "datafusion.expr", subclass)] #[derive(Clone)] pub struct PyTryCast { try_cast: TryCast, diff --git a/src/expr/column.rs b/crates/core/src/expr/column.rs similarity index 93% rename from src/expr/column.rs rename to crates/core/src/expr/column.rs index 300079481..c1238f98a 100644 --- a/src/expr/column.rs +++ b/crates/core/src/expr/column.rs @@ -18,7 +18,13 @@ use datafusion::common::Column; use pyo3::prelude::*; -#[pyclass(frozen, name = "Column", module = "datafusion.expr", subclass)] +#[pyclass( + from_py_object, + frozen, + name = "Column", + module = "datafusion.expr", + subclass +)] #[derive(Clone)] pub struct PyColumn { pub col: Column, diff --git 
a/src/expr/conditional_expr.rs b/crates/core/src/expr/conditional_expr.rs similarity index 92% rename from src/expr/conditional_expr.rs rename to crates/core/src/expr/conditional_expr.rs index 21f538ba0..ea21fdb20 100644 --- a/src/expr/conditional_expr.rs +++ b/crates/core/src/expr/conditional_expr.rs @@ -15,14 +15,22 @@ // specific language governing permissions and limitations // under the License. -use crate::{errors::PyDataFusionResult, expr::PyExpr}; use datafusion::logical_expr::conditional_expressions::CaseBuilder; use datafusion::prelude::Expr; use pyo3::prelude::*; +use crate::errors::PyDataFusionResult; +use crate::expr::PyExpr; + // TODO(tsaucer) replace this all with CaseBuilder after it implements Clone #[derive(Clone, Debug)] -#[pyclass(name = "CaseBuilder", module = "datafusion.expr", subclass, frozen)] +#[pyclass( + from_py_object, + name = "CaseBuilder", + module = "datafusion.expr", + subclass, + frozen +)] pub struct PyCaseBuilder { expr: Option, when: Vec, diff --git a/src/expr/copy_to.rs b/crates/core/src/expr/copy_to.rs similarity index 87% rename from src/expr/copy_to.rs rename to crates/core/src/expr/copy_to.rs index 422ab77f4..78e53cdff 100644 --- a/src/expr/copy_to.rs +++ b/crates/core/src/expr/copy_to.rs @@ -15,20 +15,25 @@ // specific language governing permissions and limitations // under the License. 
-use std::{ - collections::HashMap, - fmt::{self, Display, Formatter}, - sync::Arc, -}; +use std::collections::HashMap; +use std::fmt::{self, Display, Formatter}; +use std::sync::Arc; -use datafusion::{common::file_options::file_type::FileType, logical_expr::dml::CopyTo}; -use pyo3::{prelude::*, IntoPyObjectExt}; - -use crate::sql::logical::PyLogicalPlan; +use datafusion::common::file_options::file_type::FileType; +use datafusion::logical_expr::dml::CopyTo; +use pyo3::IntoPyObjectExt; +use pyo3::prelude::*; use super::logical_node::LogicalNode; +use crate::sql::logical::PyLogicalPlan; -#[pyclass(frozen, name = "CopyTo", module = "datafusion.expr", subclass)] +#[pyclass( + from_py_object, + frozen, + name = "CopyTo", + module = "datafusion.expr", + subclass +)] #[derive(Clone)] pub struct PyCopyTo { copy: CopyTo, @@ -114,7 +119,13 @@ impl PyCopyTo { } } -#[pyclass(frozen, name = "FileType", module = "datafusion.expr", subclass)] +#[pyclass( + from_py_object, + frozen, + name = "FileType", + module = "datafusion.expr", + subclass +)] #[derive(Clone)] pub struct PyFileType { file_type: Arc, diff --git a/src/expr/create_catalog.rs b/crates/core/src/expr/create_catalog.rs similarity index 88% rename from src/expr/create_catalog.rs rename to crates/core/src/expr/create_catalog.rs index 361387894..fa95980c0 100644 --- a/src/expr/create_catalog.rs +++ b/crates/core/src/expr/create_catalog.rs @@ -15,19 +15,24 @@ // specific language governing permissions and limitations // under the License. 
-use std::{ - fmt::{self, Display, Formatter}, - sync::Arc, -}; +use std::fmt::{self, Display, Formatter}; +use std::sync::Arc; use datafusion::logical_expr::CreateCatalog; -use pyo3::{prelude::*, IntoPyObjectExt}; - -use crate::{common::df_schema::PyDFSchema, sql::logical::PyLogicalPlan}; +use pyo3::IntoPyObjectExt; +use pyo3::prelude::*; use super::logical_node::LogicalNode; - -#[pyclass(frozen, name = "CreateCatalog", module = "datafusion.expr", subclass)] +use crate::common::df_schema::PyDFSchema; +use crate::sql::logical::PyLogicalPlan; + +#[pyclass( + from_py_object, + frozen, + name = "CreateCatalog", + module = "datafusion.expr", + subclass +)] #[derive(Clone)] pub struct PyCreateCatalog { create: CreateCatalog, diff --git a/src/expr/create_catalog_schema.rs b/crates/core/src/expr/create_catalog_schema.rs similarity index 92% rename from src/expr/create_catalog_schema.rs rename to crates/core/src/expr/create_catalog_schema.rs index cb3be2d30..d836284a0 100644 --- a/src/expr/create_catalog_schema.rs +++ b/crates/core/src/expr/create_catalog_schema.rs @@ -15,19 +15,19 @@ // specific language governing permissions and limitations // under the License. 
-use std::{ - fmt::{self, Display, Formatter}, - sync::Arc, -}; +use std::fmt::{self, Display, Formatter}; +use std::sync::Arc; use datafusion::logical_expr::CreateCatalogSchema; -use pyo3::{prelude::*, IntoPyObjectExt}; - -use crate::{common::df_schema::PyDFSchema, sql::logical::PyLogicalPlan}; +use pyo3::IntoPyObjectExt; +use pyo3::prelude::*; use super::logical_node::LogicalNode; +use crate::common::df_schema::PyDFSchema; +use crate::sql::logical::PyLogicalPlan; #[pyclass( + from_py_object, frozen, name = "CreateCatalogSchema", module = "datafusion.expr", diff --git a/src/expr/create_external_table.rs b/crates/core/src/expr/create_external_table.rs similarity index 90% rename from src/expr/create_external_table.rs rename to crates/core/src/expr/create_external_table.rs index 920d0d613..980eea131 100644 --- a/src/expr/create_external_table.rs +++ b/crates/core/src/expr/create_external_table.rs @@ -15,21 +15,23 @@ // specific language governing permissions and limitations // under the License. 
-use crate::{common::schema::PyConstraints, expr::PyExpr, sql::logical::PyLogicalPlan}; -use std::{ - collections::HashMap, - fmt::{self, Display, Formatter}, - sync::Arc, -}; +use std::collections::HashMap; +use std::fmt::{self, Display, Formatter}; +use std::sync::Arc; use datafusion::logical_expr::CreateExternalTable; -use pyo3::{prelude::*, IntoPyObjectExt}; +use pyo3::IntoPyObjectExt; +use pyo3::prelude::*; +use super::logical_node::LogicalNode; +use super::sort_expr::PySortExpr; use crate::common::df_schema::PyDFSchema; - -use super::{logical_node::LogicalNode, sort_expr::PySortExpr}; +use crate::common::schema::PyConstraints; +use crate::expr::PyExpr; +use crate::sql::logical::PyLogicalPlan; #[pyclass( + from_py_object, frozen, name = "CreateExternalTable", module = "datafusion.expr", @@ -66,7 +68,7 @@ impl Display for PyCreateExternalTable { impl PyCreateExternalTable { #[allow(clippy::too_many_arguments)] #[new] - #[pyo3(signature = (schema, name, location, file_type, table_partition_cols, if_not_exists, temporary, order_exprs, unbounded, options, constraints, column_defaults, definition=None))] + #[pyo3(signature = (schema, name, location, file_type, table_partition_cols, if_not_exists, or_replace, temporary, order_exprs, unbounded, options, constraints, column_defaults, definition=None))] pub fn new( schema: PyDFSchema, name: String, @@ -74,6 +76,7 @@ impl PyCreateExternalTable { file_type: String, table_partition_cols: Vec, if_not_exists: bool, + or_replace: bool, temporary: bool, order_exprs: Vec>, unbounded: bool, @@ -89,6 +92,7 @@ impl PyCreateExternalTable { file_type, table_partition_cols, if_not_exists, + or_replace, temporary, definition, order_exprs: order_exprs diff --git a/src/expr/create_function.rs b/crates/core/src/expr/create_function.rs similarity index 91% rename from src/expr/create_function.rs rename to crates/core/src/expr/create_function.rs index 1b663b466..622858913 100644 --- a/src/expr/create_function.rs +++ 
b/crates/core/src/expr/create_function.rs @@ -15,22 +15,28 @@ // specific language governing permissions and limitations // under the License. -use std::{ - fmt::{self, Display, Formatter}, - sync::Arc, -}; +use std::fmt::{self, Display, Formatter}; +use std::sync::Arc; use datafusion::logical_expr::{ CreateFunction, CreateFunctionBody, OperateFunctionArg, Volatility, }; -use pyo3::{prelude::*, IntoPyObjectExt}; +use pyo3::IntoPyObjectExt; +use pyo3::prelude::*; -use super::logical_node::LogicalNode; use super::PyExpr; -use crate::common::{data_type::PyDataType, df_schema::PyDFSchema}; +use super::logical_node::LogicalNode; +use crate::common::data_type::PyDataType; +use crate::common::df_schema::PyDFSchema; use crate::sql::logical::PyLogicalPlan; -#[pyclass(frozen, name = "CreateFunction", module = "datafusion.expr", subclass)] +#[pyclass( + from_py_object, + frozen, + name = "CreateFunction", + module = "datafusion.expr", + subclass +)] #[derive(Clone)] pub struct PyCreateFunction { create: CreateFunction, @@ -55,6 +61,7 @@ impl Display for PyCreateFunction { } #[pyclass( + from_py_object, frozen, name = "OperateFunctionArg", module = "datafusion.expr", @@ -66,7 +73,14 @@ pub struct PyOperateFunctionArg { } #[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] -#[pyclass(frozen, eq, eq_int, name = "Volatility", module = "datafusion.expr")] +#[pyclass( + from_py_object, + frozen, + eq, + eq_int, + name = "Volatility", + module = "datafusion.expr" +)] pub enum PyVolatility { Immutable, Stable, @@ -74,6 +88,7 @@ pub enum PyVolatility { } #[pyclass( + from_py_object, frozen, name = "CreateFunctionBody", module = "datafusion.expr", diff --git a/src/expr/create_index.rs b/crates/core/src/expr/create_index.rs similarity index 89% rename from src/expr/create_index.rs rename to crates/core/src/expr/create_index.rs index 7b68df305..5f9bd11e8 100644 --- a/src/expr/create_index.rs +++ b/crates/core/src/expr/create_index.rs @@ -15,19 +15,25 @@ // specific language 
governing permissions and limitations // under the License. -use std::{ - fmt::{self, Display, Formatter}, - sync::Arc, -}; +use std::fmt::{self, Display, Formatter}; +use std::sync::Arc; use datafusion::logical_expr::CreateIndex; -use pyo3::{prelude::*, IntoPyObjectExt}; - -use crate::{common::df_schema::PyDFSchema, sql::logical::PyLogicalPlan}; - -use super::{logical_node::LogicalNode, sort_expr::PySortExpr}; - -#[pyclass(frozen, name = "CreateIndex", module = "datafusion.expr", subclass)] +use pyo3::IntoPyObjectExt; +use pyo3::prelude::*; + +use super::logical_node::LogicalNode; +use super::sort_expr::PySortExpr; +use crate::common::df_schema::PyDFSchema; +use crate::sql::logical::PyLogicalPlan; + +#[pyclass( + from_py_object, + frozen, + name = "CreateIndex", + module = "datafusion.expr", + subclass +)] #[derive(Clone)] pub struct PyCreateIndex { create: CreateIndex, diff --git a/src/expr/create_memory_table.rs b/crates/core/src/expr/create_memory_table.rs similarity index 97% rename from src/expr/create_memory_table.rs rename to crates/core/src/expr/create_memory_table.rs index 15aaa810b..3214dab0e 100644 --- a/src/expr/create_memory_table.rs +++ b/crates/core/src/expr/create_memory_table.rs @@ -18,13 +18,14 @@ use std::fmt::{self, Display, Formatter}; use datafusion::logical_expr::CreateMemoryTable; -use pyo3::{prelude::*, IntoPyObjectExt}; - -use crate::sql::logical::PyLogicalPlan; +use pyo3::IntoPyObjectExt; +use pyo3::prelude::*; use super::logical_node::LogicalNode; +use crate::sql::logical::PyLogicalPlan; #[pyclass( + from_py_object, frozen, name = "CreateMemoryTable", module = "datafusion.expr", diff --git a/src/expr/create_view.rs b/crates/core/src/expr/create_view.rs similarity index 92% rename from src/expr/create_view.rs rename to crates/core/src/expr/create_view.rs index 49b3b6199..6941ef769 100644 --- a/src/expr/create_view.rs +++ b/crates/core/src/expr/create_view.rs @@ -18,13 +18,20 @@ use std::fmt::{self, Display, Formatter}; use 
datafusion::logical_expr::{CreateView, DdlStatement, LogicalPlan}; -use pyo3::{prelude::*, IntoPyObjectExt}; - -use crate::{errors::py_type_err, sql::logical::PyLogicalPlan}; +use pyo3::IntoPyObjectExt; +use pyo3::prelude::*; use super::logical_node::LogicalNode; - -#[pyclass(frozen, name = "CreateView", module = "datafusion.expr", subclass)] +use crate::errors::py_type_err; +use crate::sql::logical::PyLogicalPlan; + +#[pyclass( + from_py_object, + frozen, + name = "CreateView", + module = "datafusion.expr", + subclass +)] #[derive(Clone)] pub struct PyCreateView { create: CreateView, diff --git a/src/expr/describe_table.rs b/crates/core/src/expr/describe_table.rs similarity index 86% rename from src/expr/describe_table.rs rename to crates/core/src/expr/describe_table.rs index 315026fef..73955bb34 100644 --- a/src/expr/describe_table.rs +++ b/crates/core/src/expr/describe_table.rs @@ -15,20 +15,26 @@ // specific language governing permissions and limitations // under the License. -use std::{ - fmt::{self, Display, Formatter}, - sync::Arc, -}; +use std::fmt::{self, Display, Formatter}; +use std::sync::Arc; -use arrow::{datatypes::Schema, pyarrow::PyArrowType}; +use arrow::datatypes::Schema; +use arrow::pyarrow::PyArrowType; use datafusion::logical_expr::DescribeTable; -use pyo3::{prelude::*, IntoPyObjectExt}; - -use crate::{common::df_schema::PyDFSchema, sql::logical::PyLogicalPlan}; +use pyo3::IntoPyObjectExt; +use pyo3::prelude::*; use super::logical_node::LogicalNode; +use crate::common::df_schema::PyDFSchema; +use crate::sql::logical::PyLogicalPlan; -#[pyclass(frozen, name = "DescribeTable", module = "datafusion.expr", subclass)] +#[pyclass( + from_py_object, + frozen, + name = "DescribeTable", + module = "datafusion.expr", + subclass +)] #[derive(Clone)] pub struct PyDescribeTable { describe: DescribeTable, diff --git a/src/expr/distinct.rs b/crates/core/src/expr/distinct.rs similarity index 94% rename from src/expr/distinct.rs rename to 
crates/core/src/expr/distinct.rs index 5770b849d..68c2a17fe 100644 --- a/src/expr/distinct.rs +++ b/crates/core/src/expr/distinct.rs @@ -18,13 +18,19 @@ use std::fmt::{self, Display, Formatter}; use datafusion::logical_expr::Distinct; -use pyo3::{prelude::*, IntoPyObjectExt}; - -use crate::sql::logical::PyLogicalPlan; +use pyo3::IntoPyObjectExt; +use pyo3::prelude::*; use super::logical_node::LogicalNode; +use crate::sql::logical::PyLogicalPlan; -#[pyclass(frozen, name = "Distinct", module = "datafusion.expr", subclass)] +#[pyclass( + from_py_object, + frozen, + name = "Distinct", + module = "datafusion.expr", + subclass +)] #[derive(Clone)] pub struct PyDistinct { distinct: Distinct, diff --git a/src/expr/dml.rs b/crates/core/src/expr/dml.rs similarity index 88% rename from src/expr/dml.rs rename to crates/core/src/expr/dml.rs index 4437a9de9..26f975820 100644 --- a/src/expr/dml.rs +++ b/crates/core/src/expr/dml.rs @@ -17,14 +17,21 @@ use datafusion::logical_expr::dml::InsertOp; use datafusion::logical_expr::{DmlStatement, WriteOp}; -use pyo3::{prelude::*, IntoPyObjectExt}; - -use crate::common::schema::PyTableSource; -use crate::{common::df_schema::PyDFSchema, sql::logical::PyLogicalPlan}; +use pyo3::IntoPyObjectExt; +use pyo3::prelude::*; use super::logical_node::LogicalNode; - -#[pyclass(frozen, name = "DmlStatement", module = "datafusion.expr", subclass)] +use crate::common::df_schema::PyDFSchema; +use crate::common::schema::PyTableSource; +use crate::sql::logical::PyLogicalPlan; + +#[pyclass( + from_py_object, + frozen, + name = "DmlStatement", + module = "datafusion.expr", + subclass +)] #[derive(Clone)] pub struct PyDmlStatement { dml: DmlStatement, @@ -88,15 +95,21 @@ impl PyDmlStatement { } #[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] -#[pyclass(eq, eq_int, name = "WriteOp", module = "datafusion.expr")] +#[pyclass( + from_py_object, + eq, + eq_int, + name = "WriteOp", + module = "datafusion.expr" +)] pub enum PyWriteOp { Append, 
Overwrite, Replace, - Update, Delete, Ctas, + Truncate, } impl From for PyWriteOp { @@ -105,10 +118,10 @@ impl From for PyWriteOp { WriteOp::Insert(InsertOp::Append) => PyWriteOp::Append, WriteOp::Insert(InsertOp::Overwrite) => PyWriteOp::Overwrite, WriteOp::Insert(InsertOp::Replace) => PyWriteOp::Replace, - WriteOp::Update => PyWriteOp::Update, WriteOp::Delete => PyWriteOp::Delete, WriteOp::Ctas => PyWriteOp::Ctas, + WriteOp::Truncate => PyWriteOp::Truncate, } } } @@ -119,10 +132,10 @@ impl From for WriteOp { PyWriteOp::Append => WriteOp::Insert(InsertOp::Append), PyWriteOp::Overwrite => WriteOp::Insert(InsertOp::Overwrite), PyWriteOp::Replace => WriteOp::Insert(InsertOp::Replace), - PyWriteOp::Update => WriteOp::Update, PyWriteOp::Delete => WriteOp::Delete, PyWriteOp::Ctas => WriteOp::Ctas, + PyWriteOp::Truncate => WriteOp::Truncate, } } } diff --git a/src/expr/drop_catalog_schema.rs b/crates/core/src/expr/drop_catalog_schema.rs similarity index 91% rename from src/expr/drop_catalog_schema.rs rename to crates/core/src/expr/drop_catalog_schema.rs index 7008bcd24..fd5105332 100644 --- a/src/expr/drop_catalog_schema.rs +++ b/crates/core/src/expr/drop_catalog_schema.rs @@ -15,20 +15,22 @@ // specific language governing permissions and limitations // under the License. 
-use std::{ - fmt::{self, Display, Formatter}, - sync::Arc, -}; +use std::fmt::{self, Display, Formatter}; +use std::sync::Arc; -use datafusion::{common::SchemaReference, logical_expr::DropCatalogSchema, sql::TableReference}; -use pyo3::{exceptions::PyValueError, prelude::*, IntoPyObjectExt}; - -use crate::common::df_schema::PyDFSchema; +use datafusion::common::SchemaReference; +use datafusion::logical_expr::DropCatalogSchema; +use datafusion::sql::TableReference; +use pyo3::IntoPyObjectExt; +use pyo3::exceptions::PyValueError; +use pyo3::prelude::*; use super::logical_node::LogicalNode; +use crate::common::df_schema::PyDFSchema; use crate::sql::logical::PyLogicalPlan; #[pyclass( + from_py_object, frozen, name = "DropCatalogSchema", module = "datafusion.expr", diff --git a/src/expr/drop_function.rs b/crates/core/src/expr/drop_function.rs similarity index 91% rename from src/expr/drop_function.rs rename to crates/core/src/expr/drop_function.rs index 42ad3e1fe..0599dd49e 100644 --- a/src/expr/drop_function.rs +++ b/crates/core/src/expr/drop_function.rs @@ -15,19 +15,24 @@ // specific language governing permissions and limitations // under the License. 
-use std::{ - fmt::{self, Display, Formatter}, - sync::Arc, -}; +use std::fmt::{self, Display, Formatter}; +use std::sync::Arc; use datafusion::logical_expr::DropFunction; -use pyo3::{prelude::*, IntoPyObjectExt}; +use pyo3::IntoPyObjectExt; +use pyo3::prelude::*; use super::logical_node::LogicalNode; use crate::common::df_schema::PyDFSchema; use crate::sql::logical::PyLogicalPlan; -#[pyclass(frozen, name = "DropFunction", module = "datafusion.expr", subclass)] +#[pyclass( + from_py_object, + frozen, + name = "DropFunction", + module = "datafusion.expr", + subclass +)] #[derive(Clone)] pub struct PyDropFunction { drop: DropFunction, diff --git a/src/expr/drop_table.rs b/crates/core/src/expr/drop_table.rs similarity index 93% rename from src/expr/drop_table.rs rename to crates/core/src/expr/drop_table.rs index 6ff4f01c4..46fe67465 100644 --- a/src/expr/drop_table.rs +++ b/crates/core/src/expr/drop_table.rs @@ -18,13 +18,19 @@ use std::fmt::{self, Display, Formatter}; use datafusion::logical_expr::logical_plan::DropTable; -use pyo3::{prelude::*, IntoPyObjectExt}; - -use crate::sql::logical::PyLogicalPlan; +use pyo3::IntoPyObjectExt; +use pyo3::prelude::*; use super::logical_node::LogicalNode; +use crate::sql::logical::PyLogicalPlan; -#[pyclass(frozen, name = "DropTable", module = "datafusion.expr", subclass)] +#[pyclass( + from_py_object, + frozen, + name = "DropTable", + module = "datafusion.expr", + subclass +)] #[derive(Clone)] pub struct PyDropTable { drop: DropTable, diff --git a/src/expr/drop_view.rs b/crates/core/src/expr/drop_view.rs similarity index 91% rename from src/expr/drop_view.rs rename to crates/core/src/expr/drop_view.rs index b2aff4e9b..0d0c51f13 100644 --- a/src/expr/drop_view.rs +++ b/crates/core/src/expr/drop_view.rs @@ -15,20 +15,24 @@ // specific language governing permissions and limitations // under the License. 
-use std::{ - fmt::{self, Display, Formatter}, - sync::Arc, -}; +use std::fmt::{self, Display, Formatter}; +use std::sync::Arc; use datafusion::logical_expr::DropView; -use pyo3::{prelude::*, IntoPyObjectExt}; - -use crate::common::df_schema::PyDFSchema; +use pyo3::IntoPyObjectExt; +use pyo3::prelude::*; use super::logical_node::LogicalNode; +use crate::common::df_schema::PyDFSchema; use crate::sql::logical::PyLogicalPlan; -#[pyclass(frozen, name = "DropView", module = "datafusion.expr", subclass)] +#[pyclass( + from_py_object, + frozen, + name = "DropView", + module = "datafusion.expr", + subclass +)] #[derive(Clone)] pub struct PyDropView { drop: DropView, diff --git a/src/expr/empty_relation.rs b/crates/core/src/expr/empty_relation.rs similarity index 90% rename from src/expr/empty_relation.rs rename to crates/core/src/expr/empty_relation.rs index 797a8c02d..f3c237731 100644 --- a/src/expr/empty_relation.rs +++ b/crates/core/src/expr/empty_relation.rs @@ -15,14 +15,23 @@ // specific language governing permissions and limitations // under the License. 
-use crate::{common::df_schema::PyDFSchema, sql::logical::PyLogicalPlan}; -use datafusion::logical_expr::EmptyRelation; -use pyo3::{prelude::*, IntoPyObjectExt}; use std::fmt::{self, Display, Formatter}; +use datafusion::logical_expr::EmptyRelation; +use pyo3::IntoPyObjectExt; +use pyo3::prelude::*; + use super::logical_node::LogicalNode; +use crate::common::df_schema::PyDFSchema; +use crate::sql::logical::PyLogicalPlan; -#[pyclass(frozen, name = "EmptyRelation", module = "datafusion.expr", subclass)] +#[pyclass( + from_py_object, + frozen, + name = "EmptyRelation", + module = "datafusion.expr", + subclass +)] #[derive(Clone)] pub struct PyEmptyRelation { empty: EmptyRelation, diff --git a/src/expr/exists.rs b/crates/core/src/expr/exists.rs similarity index 91% rename from src/expr/exists.rs rename to crates/core/src/expr/exists.rs index 392bfcb9e..d2e816127 100644 --- a/src/expr/exists.rs +++ b/crates/core/src/expr/exists.rs @@ -20,7 +20,13 @@ use pyo3::prelude::*; use super::subquery::PySubquery; -#[pyclass(frozen, name = "Exists", module = "datafusion.expr", subclass)] +#[pyclass( + from_py_object, + frozen, + name = "Exists", + module = "datafusion.expr", + subclass +)] #[derive(Clone)] pub struct PyExists { exists: Exists, diff --git a/src/expr/explain.rs b/crates/core/src/expr/explain.rs similarity index 88% rename from src/expr/explain.rs rename to crates/core/src/expr/explain.rs index 71b7b2c13..6259951de 100644 --- a/src/expr/explain.rs +++ b/crates/core/src/expr/explain.rs @@ -17,14 +17,23 @@ use std::fmt::{self, Display, Formatter}; -use datafusion::logical_expr::{logical_plan::Explain, LogicalPlan}; -use pyo3::{prelude::*, IntoPyObjectExt}; - -use crate::{common::df_schema::PyDFSchema, errors::py_type_err, sql::logical::PyLogicalPlan}; +use datafusion::logical_expr::LogicalPlan; +use datafusion::logical_expr::logical_plan::Explain; +use pyo3::IntoPyObjectExt; +use pyo3::prelude::*; use super::logical_node::LogicalNode; - -#[pyclass(frozen, name = 
"Explain", module = "datafusion.expr", subclass)] +use crate::common::df_schema::PyDFSchema; +use crate::errors::py_type_err; +use crate::sql::logical::PyLogicalPlan; + +#[pyclass( + from_py_object, + frozen, + name = "Explain", + module = "datafusion.expr", + subclass +)] #[derive(Clone)] pub struct PyExplain { explain: Explain, diff --git a/src/expr/extension.rs b/crates/core/src/expr/extension.rs similarity index 90% rename from src/expr/extension.rs rename to crates/core/src/expr/extension.rs index 7d913ff8c..a0b617565 100644 --- a/src/expr/extension.rs +++ b/crates/core/src/expr/extension.rs @@ -16,13 +16,19 @@ // under the License. use datafusion::logical_expr::Extension; -use pyo3::{prelude::*, IntoPyObjectExt}; - -use crate::sql::logical::PyLogicalPlan; +use pyo3::IntoPyObjectExt; +use pyo3::prelude::*; use super::logical_node::LogicalNode; +use crate::sql::logical::PyLogicalPlan; -#[pyclass(frozen, name = "Extension", module = "datafusion.expr", subclass)] +#[pyclass( + from_py_object, + frozen, + name = "Extension", + module = "datafusion.expr", + subclass +)] #[derive(Clone)] pub struct PyExtension { pub node: Extension, diff --git a/src/expr/filter.rs b/crates/core/src/expr/filter.rs similarity index 93% rename from src/expr/filter.rs rename to crates/core/src/expr/filter.rs index 76338d139..67426806d 100644 --- a/src/expr/filter.rs +++ b/crates/core/src/expr/filter.rs @@ -15,16 +15,24 @@ // specific language governing permissions and limitations // under the License. 
-use datafusion::logical_expr::logical_plan::Filter; -use pyo3::{prelude::*, IntoPyObjectExt}; use std::fmt::{self, Display, Formatter}; +use datafusion::logical_expr::logical_plan::Filter; +use pyo3::IntoPyObjectExt; +use pyo3::prelude::*; + use crate::common::df_schema::PyDFSchema; -use crate::expr::logical_node::LogicalNode; use crate::expr::PyExpr; +use crate::expr::logical_node::LogicalNode; use crate::sql::logical::PyLogicalPlan; -#[pyclass(frozen, name = "Filter", module = "datafusion.expr", subclass)] +#[pyclass( + from_py_object, + frozen, + name = "Filter", + module = "datafusion.expr", + subclass +)] #[derive(Clone)] pub struct PyFilter { filter: Filter, diff --git a/src/expr/grouping_set.rs b/crates/core/src/expr/grouping_set.rs similarity index 91% rename from src/expr/grouping_set.rs rename to crates/core/src/expr/grouping_set.rs index 107dd9370..549a866ed 100644 --- a/src/expr/grouping_set.rs +++ b/crates/core/src/expr/grouping_set.rs @@ -18,7 +18,13 @@ use datafusion::logical_expr::GroupingSet; use pyo3::prelude::*; -#[pyclass(frozen, name = "GroupingSet", module = "datafusion.expr", subclass)] +#[pyclass( + from_py_object, + frozen, + name = "GroupingSet", + module = "datafusion.expr", + subclass +)] #[derive(Clone)] pub struct PyGroupingSet { grouping_set: GroupingSet, diff --git a/src/expr/in_list.rs b/crates/core/src/expr/in_list.rs similarity index 92% rename from src/expr/in_list.rs rename to crates/core/src/expr/in_list.rs index e2e6d7832..0612cc21e 100644 --- a/src/expr/in_list.rs +++ b/crates/core/src/expr/in_list.rs @@ -15,11 +15,18 @@ // specific language governing permissions and limitations // under the License. 
-use crate::expr::PyExpr; use datafusion::logical_expr::expr::InList; use pyo3::prelude::*; -#[pyclass(frozen, name = "InList", module = "datafusion.expr", subclass)] +use crate::expr::PyExpr; + +#[pyclass( + from_py_object, + frozen, + name = "InList", + module = "datafusion.expr", + subclass +)] #[derive(Clone)] pub struct PyInList { in_list: InList, diff --git a/src/expr/in_subquery.rs b/crates/core/src/expr/in_subquery.rs similarity index 89% rename from src/expr/in_subquery.rs rename to crates/core/src/expr/in_subquery.rs index 6d4a38e49..81a2c5794 100644 --- a/src/expr/in_subquery.rs +++ b/crates/core/src/expr/in_subquery.rs @@ -18,9 +18,16 @@ use datafusion::logical_expr::expr::InSubquery; use pyo3::prelude::*; -use super::{subquery::PySubquery, PyExpr}; +use super::PyExpr; +use super::subquery::PySubquery; -#[pyclass(frozen, name = "InSubquery", module = "datafusion.expr", subclass)] +#[pyclass( + from_py_object, + frozen, + name = "InSubquery", + module = "datafusion.expr", + subclass +)] #[derive(Clone)] pub struct PyInSubquery { in_subquery: InSubquery, diff --git a/src/expr/indexed_field.rs b/crates/core/src/expr/indexed_field.rs similarity index 94% rename from src/expr/indexed_field.rs rename to crates/core/src/expr/indexed_field.rs index 1dfa0ed2f..98a90d8d4 100644 --- a/src/expr/indexed_field.rs +++ b/crates/core/src/expr/indexed_field.rs @@ -15,14 +15,21 @@ // specific language governing permissions and limitations // under the License. 
-use crate::expr::PyExpr; +use std::fmt::{Display, Formatter}; + use datafusion::logical_expr::expr::{GetFieldAccess, GetIndexedField}; use pyo3::prelude::*; -use std::fmt::{Display, Formatter}; use super::literal::PyLiteral; +use crate::expr::PyExpr; -#[pyclass(frozen, name = "GetIndexedField", module = "datafusion.expr", subclass)] +#[pyclass( + from_py_object, + frozen, + name = "GetIndexedField", + module = "datafusion.expr", + subclass +)] #[derive(Clone)] pub struct PyGetIndexedField { indexed_field: GetIndexedField, diff --git a/src/expr/join.rs b/crates/core/src/expr/join.rs similarity index 92% rename from src/expr/join.rs rename to crates/core/src/expr/join.rs index 3fde874d5..b90f2f57d 100644 --- a/src/expr/join.rs +++ b/crates/core/src/expr/join.rs @@ -15,17 +15,20 @@ // specific language governing permissions and limitations // under the License. +use std::fmt::{self, Display, Formatter}; + use datafusion::common::NullEquality; use datafusion::logical_expr::logical_plan::{Join, JoinConstraint, JoinType}; -use pyo3::{prelude::*, IntoPyObjectExt}; -use std::fmt::{self, Display, Formatter}; +use pyo3::IntoPyObjectExt; +use pyo3::prelude::*; use crate::common::df_schema::PyDFSchema; -use crate::expr::{logical_node::LogicalNode, PyExpr}; +use crate::expr::PyExpr; +use crate::expr::logical_node::LogicalNode; use crate::sql::logical::PyLogicalPlan; #[derive(Debug, Clone, PartialEq, Eq, Hash)] -#[pyclass(frozen, name = "JoinType", module = "datafusion.expr")] +#[pyclass(from_py_object, frozen, name = "JoinType", module = "datafusion.expr")] pub struct PyJoinType { join_type: JoinType, } @@ -60,7 +63,12 @@ impl Display for PyJoinType { } #[derive(Debug, Clone, Copy)] -#[pyclass(frozen, name = "JoinConstraint", module = "datafusion.expr")] +#[pyclass( + from_py_object, + frozen, + name = "JoinConstraint", + module = "datafusion.expr" +)] pub struct PyJoinConstraint { join_constraint: JoinConstraint, } @@ -87,7 +95,13 @@ impl PyJoinConstraint { } } 
-#[pyclass(frozen, name = "Join", module = "datafusion.expr", subclass)] +#[pyclass( + from_py_object, + frozen, + name = "Join", + module = "datafusion.expr", + subclass +)] #[derive(Clone)] pub struct PyJoin { join: Join, diff --git a/src/expr/like.rs b/crates/core/src/expr/like.rs similarity index 92% rename from src/expr/like.rs rename to crates/core/src/expr/like.rs index 0a36dcd92..417dc9182 100644 --- a/src/expr/like.rs +++ b/crates/core/src/expr/like.rs @@ -15,13 +15,20 @@ // specific language governing permissions and limitations // under the License. +use std::fmt::{self, Display, Formatter}; + use datafusion::logical_expr::expr::Like; use pyo3::prelude::*; -use std::fmt::{self, Display, Formatter}; use crate::expr::PyExpr; -#[pyclass(frozen, name = "Like", module = "datafusion.expr", subclass)] +#[pyclass( + from_py_object, + frozen, + name = "Like", + module = "datafusion.expr", + subclass +)] #[derive(Clone)] pub struct PyLike { like: Like, @@ -79,7 +86,13 @@ impl PyLike { } } -#[pyclass(frozen, name = "ILike", module = "datafusion.expr", subclass)] +#[pyclass( + from_py_object, + frozen, + name = "ILike", + module = "datafusion.expr", + subclass +)] #[derive(Clone)] pub struct PyILike { like: Like, @@ -137,7 +150,13 @@ impl PyILike { } } -#[pyclass(frozen, name = "SimilarTo", module = "datafusion.expr", subclass)] +#[pyclass( + from_py_object, + frozen, + name = "SimilarTo", + module = "datafusion.expr", + subclass +)] #[derive(Clone)] pub struct PySimilarTo { like: Like, diff --git a/src/expr/limit.rs b/crates/core/src/expr/limit.rs similarity index 94% rename from src/expr/limit.rs rename to crates/core/src/expr/limit.rs index cf6971fb3..c04b8bfa8 100644 --- a/src/expr/limit.rs +++ b/crates/core/src/expr/limit.rs @@ -15,15 +15,23 @@ // specific language governing permissions and limitations // under the License. 
-use datafusion::logical_expr::logical_plan::Limit; -use pyo3::{prelude::*, IntoPyObjectExt}; use std::fmt::{self, Display, Formatter}; +use datafusion::logical_expr::logical_plan::Limit; +use pyo3::IntoPyObjectExt; +use pyo3::prelude::*; + use crate::common::df_schema::PyDFSchema; use crate::expr::logical_node::LogicalNode; use crate::sql::logical::PyLogicalPlan; -#[pyclass(frozen, name = "Limit", module = "datafusion.expr", subclass)] +#[pyclass( + from_py_object, + frozen, + name = "Limit", + module = "datafusion.expr", + subclass +)] #[derive(Clone)] pub struct PyLimit { limit: Limit, diff --git a/src/expr/literal.rs b/crates/core/src/expr/literal.rs similarity index 95% rename from src/expr/literal.rs rename to crates/core/src/expr/literal.rs index 8a589b55a..9db0f594b 100644 --- a/src/expr/literal.rs +++ b/crates/core/src/expr/literal.rs @@ -15,11 +15,20 @@ // specific language governing permissions and limitations // under the License. +use datafusion::common::ScalarValue; +use datafusion::logical_expr::expr::FieldMetadata; +use pyo3::IntoPyObjectExt; +use pyo3::prelude::*; + use crate::errors::PyDataFusionError; -use datafusion::{common::ScalarValue, logical_expr::expr::FieldMetadata}; -use pyo3::{prelude::*, IntoPyObjectExt}; -#[pyclass(name = "Literal", module = "datafusion.expr", subclass, frozen)] +#[pyclass( + from_py_object, + name = "Literal", + module = "datafusion.expr", + subclass, + frozen +)] #[derive(Clone)] pub struct PyLiteral { pub value: ScalarValue, diff --git a/src/expr/logical_node.rs b/crates/core/src/expr/logical_node.rs similarity index 100% rename from src/expr/logical_node.rs rename to crates/core/src/expr/logical_node.rs diff --git a/src/expr/placeholder.rs b/crates/core/src/expr/placeholder.rs similarity index 76% rename from src/expr/placeholder.rs rename to crates/core/src/expr/placeholder.rs index 268263d41..6bd88321c 100644 --- a/src/expr/placeholder.rs +++ b/crates/core/src/expr/placeholder.rs @@ -15,12 +15,20 @@ // specific 
language governing permissions and limitations // under the License. +use arrow::datatypes::Field; +use arrow::pyarrow::PyArrowType; use datafusion::logical_expr::expr::Placeholder; use pyo3::prelude::*; use crate::common::data_type::PyDataType; -#[pyclass(frozen, name = "Placeholder", module = "datafusion.expr", subclass)] +#[pyclass( + from_py_object, + frozen, + name = "Placeholder", + module = "datafusion.expr", + subclass +)] #[derive(Clone)] pub struct PyPlaceholder { placeholder: Placeholder, @@ -40,8 +48,15 @@ impl PyPlaceholder { fn data_type(&self) -> Option { self.placeholder - .data_type + .field .as_ref() - .map(|e| e.clone().into()) + .map(|f| f.data_type().clone().into()) + } + + fn field(&self) -> Option> { + self.placeholder + .field + .as_ref() + .map(|f| f.as_ref().clone().into()) } } diff --git a/src/expr/projection.rs b/crates/core/src/expr/projection.rs similarity index 95% rename from src/expr/projection.rs rename to crates/core/src/expr/projection.rs index b2d5db79b..456e06412 100644 --- a/src/expr/projection.rs +++ b/crates/core/src/expr/projection.rs @@ -15,17 +15,25 @@ // specific language governing permissions and limitations // under the License. 
-use datafusion::logical_expr::logical_plan::Projection; -use datafusion::logical_expr::Expr; -use pyo3::{prelude::*, IntoPyObjectExt}; use std::fmt::{self, Display, Formatter}; +use datafusion::logical_expr::Expr; +use datafusion::logical_expr::logical_plan::Projection; +use pyo3::IntoPyObjectExt; +use pyo3::prelude::*; + use crate::common::df_schema::PyDFSchema; -use crate::expr::logical_node::LogicalNode; use crate::expr::PyExpr; +use crate::expr::logical_node::LogicalNode; use crate::sql::logical::PyLogicalPlan; -#[pyclass(frozen, name = "Projection", module = "datafusion.expr", subclass)] +#[pyclass( + from_py_object, + frozen, + name = "Projection", + module = "datafusion.expr", + subclass +)] #[derive(Clone)] pub struct PyProjection { pub projection: Projection, diff --git a/src/expr/recursive_query.rs b/crates/core/src/expr/recursive_query.rs similarity index 94% rename from src/expr/recursive_query.rs rename to crates/core/src/expr/recursive_query.rs index fe047315e..e03137b80 100644 --- a/src/expr/recursive_query.rs +++ b/crates/core/src/expr/recursive_query.rs @@ -18,13 +18,19 @@ use std::fmt::{self, Display, Formatter}; use datafusion::logical_expr::RecursiveQuery; -use pyo3::{prelude::*, IntoPyObjectExt}; - -use crate::sql::logical::PyLogicalPlan; +use pyo3::IntoPyObjectExt; +use pyo3::prelude::*; use super::logical_node::LogicalNode; +use crate::sql::logical::PyLogicalPlan; -#[pyclass(frozen, name = "RecursiveQuery", module = "datafusion.expr", subclass)] +#[pyclass( + from_py_object, + frozen, + name = "RecursiveQuery", + module = "datafusion.expr", + subclass +)] #[derive(Clone)] pub struct PyRecursiveQuery { query: RecursiveQuery, diff --git a/src/expr/repartition.rs b/crates/core/src/expr/repartition.rs similarity index 87% rename from src/expr/repartition.rs rename to crates/core/src/expr/repartition.rs index ee6d1dc45..be39b9978 100644 --- a/src/expr/repartition.rs +++ b/crates/core/src/expr/repartition.rs @@ -17,20 +17,35 @@ use 
std::fmt::{self, Display, Formatter}; -use datafusion::logical_expr::{logical_plan::Repartition, Expr, Partitioning}; -use pyo3::{prelude::*, IntoPyObjectExt}; - -use crate::{errors::py_type_err, sql::logical::PyLogicalPlan}; - -use super::{logical_node::LogicalNode, PyExpr}; - -#[pyclass(frozen, name = "Repartition", module = "datafusion.expr", subclass)] +use datafusion::logical_expr::logical_plan::Repartition; +use datafusion::logical_expr::{Expr, Partitioning}; +use pyo3::IntoPyObjectExt; +use pyo3::prelude::*; + +use super::PyExpr; +use super::logical_node::LogicalNode; +use crate::errors::py_type_err; +use crate::sql::logical::PyLogicalPlan; + +#[pyclass( + from_py_object, + frozen, + name = "Repartition", + module = "datafusion.expr", + subclass +)] #[derive(Clone)] pub struct PyRepartition { repartition: Repartition, } -#[pyclass(frozen, name = "Partitioning", module = "datafusion.expr", subclass)] +#[pyclass( + from_py_object, + frozen, + name = "Partitioning", + module = "datafusion.expr", + subclass +)] #[derive(Clone)] pub struct PyPartitioning { partitioning: Partitioning, diff --git a/src/expr/scalar_subquery.rs b/crates/core/src/expr/scalar_subquery.rs similarity index 91% rename from src/expr/scalar_subquery.rs rename to crates/core/src/expr/scalar_subquery.rs index e58d66e19..c7852a4c4 100644 --- a/src/expr/scalar_subquery.rs +++ b/crates/core/src/expr/scalar_subquery.rs @@ -20,7 +20,13 @@ use pyo3::prelude::*; use super::subquery::PySubquery; -#[pyclass(frozen, name = "ScalarSubquery", module = "datafusion.expr", subclass)] +#[pyclass( + from_py_object, + frozen, + name = "ScalarSubquery", + module = "datafusion.expr", + subclass +)] #[derive(Clone)] pub struct PyScalarSubquery { subquery: Subquery, diff --git a/src/expr/scalar_variable.rs b/crates/core/src/expr/scalar_variable.rs similarity index 76% rename from src/expr/scalar_variable.rs rename to crates/core/src/expr/scalar_variable.rs index f3c128a4c..2d3bc4b76 100644 --- 
a/src/expr/scalar_variable.rs +++ b/crates/core/src/expr/scalar_variable.rs @@ -15,22 +15,28 @@ // specific language governing permissions and limitations // under the License. -use datafusion::arrow::datatypes::DataType; +use arrow::datatypes::FieldRef; use pyo3::prelude::*; use crate::common::data_type::PyDataType; -#[pyclass(frozen, name = "ScalarVariable", module = "datafusion.expr", subclass)] +#[pyclass( + from_py_object, + frozen, + name = "ScalarVariable", + module = "datafusion.expr", + subclass +)] #[derive(Clone)] pub struct PyScalarVariable { - data_type: DataType, + field: FieldRef, variables: Vec, } impl PyScalarVariable { - pub fn new(data_type: &DataType, variables: &[String]) -> Self { + pub fn new(field: &FieldRef, variables: &[String]) -> Self { Self { - data_type: data_type.to_owned(), + field: field.to_owned(), variables: variables.to_vec(), } } @@ -40,7 +46,7 @@ impl PyScalarVariable { impl PyScalarVariable { /// Get the data type fn data_type(&self) -> PyResult { - Ok(self.data_type.clone().into()) + Ok(self.field.data_type().clone().into()) } fn variables(&self) -> PyResult> { @@ -48,6 +54,6 @@ impl PyScalarVariable { } fn __repr__(&self) -> PyResult { - Ok(format!("{}{:?}", self.data_type, self.variables)) + Ok(format!("{}{:?}", self.field.data_type(), self.variables)) } } diff --git a/crates/core/src/expr/set_comparison.rs b/crates/core/src/expr/set_comparison.rs new file mode 100644 index 000000000..9f0c077e1 --- /dev/null +++ b/crates/core/src/expr/set_comparison.rs @@ -0,0 +1,59 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use datafusion::logical_expr::expr::SetComparison; +use pyo3::prelude::*; + +use super::subquery::PySubquery; +use crate::expr::PyExpr; + +#[pyclass( + from_py_object, + frozen, + name = "SetComparison", + module = "datafusion.set_comparison", + subclass +)] +#[derive(Clone)] +pub struct PySetComparison { + set_comparison: SetComparison, +} + +impl From for PySetComparison { + fn from(set_comparison: SetComparison) -> Self { + PySetComparison { set_comparison } + } +} + +#[pymethods] +impl PySetComparison { + fn expr(&self) -> PyExpr { + (*self.set_comparison.expr).clone().into() + } + + fn subquery(&self) -> PySubquery { + self.set_comparison.subquery.clone().into() + } + + fn op(&self) -> String { + format!("{}", self.set_comparison.op) + } + + fn quantifier(&self) -> String { + format!("{}", self.set_comparison.quantifier) + } +} diff --git a/src/expr/signature.rs b/crates/core/src/expr/signature.rs similarity index 91% rename from src/expr/signature.rs rename to crates/core/src/expr/signature.rs index e2c23dce9..35268e3a9 100644 --- a/src/expr/signature.rs +++ b/crates/core/src/expr/signature.rs @@ -19,7 +19,13 @@ use datafusion::logical_expr::{TypeSignature, Volatility}; use pyo3::prelude::*; #[allow(dead_code)] -#[pyclass(frozen, name = "Signature", module = "datafusion.expr", subclass)] +#[pyclass( + from_py_object, + frozen, + name = "Signature", + module = "datafusion.expr", + subclass +)] #[derive(Clone)] pub struct PySignature { type_signature: TypeSignature, diff --git a/src/expr/sort.rs b/crates/core/src/expr/sort.rs similarity 
index 94% rename from src/expr/sort.rs rename to crates/core/src/expr/sort.rs index d5ea07fdd..7c1e654c5 100644 --- a/src/expr/sort.rs +++ b/crates/core/src/expr/sort.rs @@ -15,17 +15,25 @@ // specific language governing permissions and limitations // under the License. +use std::fmt::{self, Display, Formatter}; + use datafusion::common::DataFusionError; use datafusion::logical_expr::logical_plan::Sort; -use pyo3::{prelude::*, IntoPyObjectExt}; -use std::fmt::{self, Display, Formatter}; +use pyo3::IntoPyObjectExt; +use pyo3::prelude::*; use crate::common::df_schema::PyDFSchema; use crate::expr::logical_node::LogicalNode; use crate::expr::sort_expr::PySortExpr; use crate::sql::logical::PyLogicalPlan; -#[pyclass(frozen, name = "Sort", module = "datafusion.expr", subclass)] +#[pyclass( + from_py_object, + frozen, + name = "Sort", + module = "datafusion.expr", + subclass +)] #[derive(Clone)] pub struct PySort { sort: Sort, diff --git a/src/expr/sort_expr.rs b/crates/core/src/expr/sort_expr.rs similarity index 95% rename from src/expr/sort_expr.rs rename to crates/core/src/expr/sort_expr.rs index 3f279027e..3c3c86bc1 100644 --- a/src/expr/sort_expr.rs +++ b/crates/core/src/expr/sort_expr.rs @@ -15,12 +15,20 @@ // specific language governing permissions and limitations // under the License. 
-use crate::expr::PyExpr; +use std::fmt::{self, Display, Formatter}; + use datafusion::logical_expr::SortExpr; use pyo3::prelude::*; -use std::fmt::{self, Display, Formatter}; -#[pyclass(frozen, name = "SortExpr", module = "datafusion.expr", subclass)] +use crate::expr::PyExpr; + +#[pyclass( + from_py_object, + frozen, + name = "SortExpr", + module = "datafusion.expr", + subclass +)] #[derive(Clone)] pub struct PySortExpr { pub(crate) sort: SortExpr, diff --git a/src/expr/statement.rs b/crates/core/src/expr/statement.rs similarity index 82% rename from src/expr/statement.rs rename to crates/core/src/expr/statement.rs index 1ea4f9f7f..5aa1e4e9c 100644 --- a/src/expr/statement.rs +++ b/crates/core/src/expr/statement.rs @@ -15,17 +15,23 @@ // specific language governing permissions and limitations // under the License. +use std::sync::Arc; + +use arrow::datatypes::Field; +use arrow::pyarrow::PyArrowType; use datafusion::logical_expr::{ - Deallocate, Execute, Prepare, SetVariable, TransactionAccessMode, TransactionConclusion, - TransactionEnd, TransactionIsolationLevel, TransactionStart, + Deallocate, Execute, Prepare, ResetVariable, SetVariable, TransactionAccessMode, + TransactionConclusion, TransactionEnd, TransactionIsolationLevel, TransactionStart, }; -use pyo3::{prelude::*, IntoPyObjectExt}; - -use crate::{common::data_type::PyDataType, sql::logical::PyLogicalPlan}; +use pyo3::IntoPyObjectExt; +use pyo3::prelude::*; -use super::{logical_node::LogicalNode, PyExpr}; +use super::PyExpr; +use super::logical_node::LogicalNode; +use crate::sql::logical::PyLogicalPlan; #[pyclass( + from_py_object, frozen, name = "TransactionStart", module = "datafusion.expr", @@ -62,6 +68,7 @@ impl LogicalNode for PyTransactionStart { #[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] #[pyclass( + from_py_object, frozen, eq, eq_int, @@ -95,6 +102,7 @@ impl TryFrom for TransactionAccessMode { #[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] #[pyclass( + 
from_py_object, frozen, eq, eq_int, @@ -173,7 +181,13 @@ impl PyTransactionStart { } } -#[pyclass(frozen, name = "TransactionEnd", module = "datafusion.expr", subclass)] +#[pyclass( + from_py_object, + frozen, + name = "TransactionEnd", + module = "datafusion.expr", + subclass +)] #[derive(Clone)] pub struct PyTransactionEnd { transaction_end: TransactionEnd, @@ -205,6 +219,7 @@ impl LogicalNode for PyTransactionEnd { #[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] #[pyclass( + from_py_object, frozen, eq, eq_int, @@ -254,7 +269,63 @@ impl PyTransactionEnd { } } -#[pyclass(frozen, name = "SetVariable", module = "datafusion.expr", subclass)] +#[pyclass( + from_py_object, + frozen, + name = "ResetVariable", + module = "datafusion.expr", + subclass +)] +#[derive(Clone)] +pub struct PyResetVariable { + reset_variable: ResetVariable, +} + +impl From for PyResetVariable { + fn from(reset_variable: ResetVariable) -> PyResetVariable { + PyResetVariable { reset_variable } + } +} + +impl TryFrom for ResetVariable { + type Error = PyErr; + + fn try_from(py: PyResetVariable) -> Result { + Ok(py.reset_variable) + } +} + +impl LogicalNode for PyResetVariable { + fn inputs(&self) -> Vec { + vec![] + } + + fn to_variant<'py>(&self, py: Python<'py>) -> PyResult> { + self.clone().into_bound_py_any(py) + } +} + +#[pymethods] +impl PyResetVariable { + #[new] + pub fn new(variable: String) -> Self { + PyResetVariable { + reset_variable: ResetVariable { variable }, + } + } + + pub fn variable(&self) -> String { + self.reset_variable.variable.clone() + } +} + +#[pyclass( + from_py_object, + frozen, + name = "SetVariable", + module = "datafusion.expr", + subclass +)] #[derive(Clone)] pub struct PySetVariable { set_variable: SetVariable, @@ -302,7 +373,13 @@ impl PySetVariable { } } -#[pyclass(frozen, name = "Prepare", module = "datafusion.expr", subclass)] +#[pyclass( + from_py_object, + frozen, + name = "Prepare", + module = "datafusion.expr", + subclass +)] #[derive(Clone)] 
pub struct PyPrepare { prepare: Prepare, @@ -335,16 +412,13 @@ impl LogicalNode for PyPrepare { #[pymethods] impl PyPrepare { #[new] - pub fn new(name: String, data_types: Vec, input: PyLogicalPlan) -> Self { + pub fn new(name: String, fields: Vec>, input: PyLogicalPlan) -> Self { let input = input.plan().clone(); - let data_types = data_types - .into_iter() - .map(|data_type| data_type.into()) - .collect(); + let fields = fields.into_iter().map(|field| Arc::new(field.0)).collect(); PyPrepare { prepare: Prepare { name, - data_types, + fields, input, }, } @@ -354,12 +428,12 @@ impl PyPrepare { self.prepare.name.clone() } - pub fn data_types(&self) -> Vec { + pub fn fields(&self) -> Vec> { self.prepare - .data_types + .fields .clone() .into_iter() - .map(|t| t.into()) + .map(|f| f.as_ref().clone().into()) .collect() } @@ -370,7 +444,13 @@ impl PyPrepare { } } -#[pyclass(frozen, name = "Execute", module = "datafusion.expr", subclass)] +#[pyclass( + from_py_object, + frozen, + name = "Execute", + module = "datafusion.expr", + subclass +)] #[derive(Clone)] pub struct PyExecute { execute: Execute, @@ -427,7 +507,13 @@ impl PyExecute { } } -#[pyclass(frozen, name = "Deallocate", module = "datafusion.expr", subclass)] +#[pyclass( + from_py_object, + frozen, + name = "Deallocate", + module = "datafusion.expr", + subclass +)] #[derive(Clone)] pub struct PyDeallocate { deallocate: Deallocate, diff --git a/src/expr/subquery.rs b/crates/core/src/expr/subquery.rs similarity index 93% rename from src/expr/subquery.rs rename to crates/core/src/expr/subquery.rs index 785cf7d1a..c6fa83db8 100644 --- a/src/expr/subquery.rs +++ b/crates/core/src/expr/subquery.rs @@ -18,13 +18,19 @@ use std::fmt::{self, Display, Formatter}; use datafusion::logical_expr::Subquery; -use pyo3::{prelude::*, IntoPyObjectExt}; - -use crate::sql::logical::PyLogicalPlan; +use pyo3::IntoPyObjectExt; +use pyo3::prelude::*; use super::logical_node::LogicalNode; +use crate::sql::logical::PyLogicalPlan; 
-#[pyclass(frozen, name = "Subquery", module = "datafusion.expr", subclass)] +#[pyclass( + from_py_object, + frozen, + name = "Subquery", + module = "datafusion.expr", + subclass +)] #[derive(Clone)] pub struct PySubquery { subquery: Subquery, diff --git a/src/expr/subquery_alias.rs b/crates/core/src/expr/subquery_alias.rs similarity index 91% rename from src/expr/subquery_alias.rs rename to crates/core/src/expr/subquery_alias.rs index ab1229bfe..a6b09e842 100644 --- a/src/expr/subquery_alias.rs +++ b/crates/core/src/expr/subquery_alias.rs @@ -18,13 +18,20 @@ use std::fmt::{self, Display, Formatter}; use datafusion::logical_expr::SubqueryAlias; -use pyo3::{prelude::*, IntoPyObjectExt}; - -use crate::{common::df_schema::PyDFSchema, sql::logical::PyLogicalPlan}; +use pyo3::IntoPyObjectExt; +use pyo3::prelude::*; use super::logical_node::LogicalNode; +use crate::common::df_schema::PyDFSchema; +use crate::sql::logical::PyLogicalPlan; -#[pyclass(frozen, name = "SubqueryAlias", module = "datafusion.expr", subclass)] +#[pyclass( + from_py_object, + frozen, + name = "SubqueryAlias", + module = "datafusion.expr", + subclass +)] #[derive(Clone)] pub struct PySubqueryAlias { subquery_alias: SubqueryAlias, diff --git a/src/expr/table_scan.rs b/crates/core/src/expr/table_scan.rs similarity index 95% rename from src/expr/table_scan.rs rename to crates/core/src/expr/table_scan.rs index 34a140df3..8ba7e4a69 100644 --- a/src/expr/table_scan.rs +++ b/crates/core/src/expr/table_scan.rs @@ -15,16 +15,25 @@ // specific language governing permissions and limitations // under the License. 
+use std::fmt::{self, Display, Formatter}; + use datafusion::common::TableReference; use datafusion::logical_expr::logical_plan::TableScan; -use pyo3::{prelude::*, IntoPyObjectExt}; -use std::fmt::{self, Display, Formatter}; +use pyo3::IntoPyObjectExt; +use pyo3::prelude::*; +use crate::common::df_schema::PyDFSchema; +use crate::expr::PyExpr; use crate::expr::logical_node::LogicalNode; use crate::sql::logical::PyLogicalPlan; -use crate::{common::df_schema::PyDFSchema, expr::PyExpr}; -#[pyclass(frozen, name = "TableScan", module = "datafusion.expr", subclass)] +#[pyclass( + from_py_object, + frozen, + name = "TableScan", + module = "datafusion.expr", + subclass +)] #[derive(Clone)] pub struct PyTableScan { table_scan: TableScan, diff --git a/src/expr/union.rs b/crates/core/src/expr/union.rs similarity index 93% rename from src/expr/union.rs rename to crates/core/src/expr/union.rs index b7b589650..a3b9efe91 100644 --- a/src/expr/union.rs +++ b/crates/core/src/expr/union.rs @@ -15,15 +15,23 @@ // specific language governing permissions and limitations // under the License. 
-use datafusion::logical_expr::logical_plan::Union; -use pyo3::{prelude::*, IntoPyObjectExt}; use std::fmt::{self, Display, Formatter}; +use datafusion::logical_expr::logical_plan::Union; +use pyo3::IntoPyObjectExt; +use pyo3::prelude::*; + use crate::common::df_schema::PyDFSchema; use crate::expr::logical_node::LogicalNode; use crate::sql::logical::PyLogicalPlan; -#[pyclass(frozen, name = "Union", module = "datafusion.expr", subclass)] +#[pyclass( + from_py_object, + frozen, + name = "Union", + module = "datafusion.expr", + subclass +)] #[derive(Clone)] pub struct PyUnion { union_: Union, diff --git a/src/expr/unnest.rs b/crates/core/src/expr/unnest.rs similarity index 93% rename from src/expr/unnest.rs rename to crates/core/src/expr/unnest.rs index 7ed7919b1..880d0a279 100644 --- a/src/expr/unnest.rs +++ b/crates/core/src/expr/unnest.rs @@ -15,15 +15,23 @@ // specific language governing permissions and limitations // under the License. -use datafusion::logical_expr::logical_plan::Unnest; -use pyo3::{prelude::*, IntoPyObjectExt}; use std::fmt::{self, Display, Formatter}; +use datafusion::logical_expr::logical_plan::Unnest; +use pyo3::IntoPyObjectExt; +use pyo3::prelude::*; + use crate::common::df_schema::PyDFSchema; use crate::expr::logical_node::LogicalNode; use crate::sql::logical::PyLogicalPlan; -#[pyclass(frozen, name = "Unnest", module = "datafusion.expr", subclass)] +#[pyclass( + from_py_object, + frozen, + name = "Unnest", + module = "datafusion.expr", + subclass +)] #[derive(Clone)] pub struct PyUnnest { unnest_: Unnest, diff --git a/src/expr/unnest_expr.rs b/crates/core/src/expr/unnest_expr.rs similarity index 93% rename from src/expr/unnest_expr.rs rename to crates/core/src/expr/unnest_expr.rs index 2cdf46a59..97feef1d1 100644 --- a/src/expr/unnest_expr.rs +++ b/crates/core/src/expr/unnest_expr.rs @@ -15,13 +15,20 @@ // specific language governing permissions and limitations // under the License. 
+use std::fmt::{self, Display, Formatter}; + use datafusion::logical_expr::expr::Unnest; use pyo3::prelude::*; -use std::fmt::{self, Display, Formatter}; use super::PyExpr; -#[pyclass(frozen, name = "UnnestExpr", module = "datafusion.expr", subclass)] +#[pyclass( + from_py_object, + frozen, + name = "UnnestExpr", + module = "datafusion.expr", + subclass +)] #[derive(Clone)] pub struct PyUnnestExpr { unnest: Unnest, diff --git a/src/expr/values.rs b/crates/core/src/expr/values.rs similarity index 86% rename from src/expr/values.rs rename to crates/core/src/expr/values.rs index 63d94ce00..d40b0e7cf 100644 --- a/src/expr/values.rs +++ b/crates/core/src/expr/values.rs @@ -18,14 +18,21 @@ use std::sync::Arc; use datafusion::logical_expr::Values; -use pyo3::{prelude::*, IntoPyObjectExt}; -use pyo3::{pyclass, PyErr, PyResult, Python}; +use pyo3::prelude::*; +use pyo3::{IntoPyObjectExt, PyErr, PyResult, Python, pyclass}; -use crate::{common::df_schema::PyDFSchema, sql::logical::PyLogicalPlan}; +use super::PyExpr; +use super::logical_node::LogicalNode; +use crate::common::df_schema::PyDFSchema; +use crate::sql::logical::PyLogicalPlan; -use super::{logical_node::LogicalNode, PyExpr}; - -#[pyclass(frozen, name = "Values", module = "datafusion.expr", subclass)] +#[pyclass( + from_py_object, + frozen, + name = "Values", + module = "datafusion.expr", + subclass +)] #[derive(Clone)] pub struct PyValues { values: Values, diff --git a/src/expr/window.rs b/crates/core/src/expr/window.rs similarity index 95% rename from src/expr/window.rs rename to crates/core/src/expr/window.rs index 2723007ec..92d909bfc 100644 --- a/src/expr/window.rs +++ b/crates/core/src/expr/window.rs @@ -15,28 +15,42 @@ // specific language governing permissions and limitations // under the License. 
-use crate::common::data_type::PyScalarValue; -use crate::common::df_schema::PyDFSchema; -use crate::errors::{py_type_err, PyDataFusionResult}; -use crate::expr::logical_node::LogicalNode; -use crate::expr::sort_expr::{py_sort_expr_list, PySortExpr}; -use crate::expr::PyExpr; -use crate::sql::logical::PyLogicalPlan; +use std::fmt::{self, Display, Formatter}; + use datafusion::common::{DataFusionError, ScalarValue}; use datafusion::logical_expr::{Expr, Window, WindowFrame, WindowFrameBound, WindowFrameUnits}; +use pyo3::IntoPyObjectExt; use pyo3::exceptions::PyNotImplementedError; -use pyo3::{prelude::*, IntoPyObjectExt}; -use std::fmt::{self, Display, Formatter}; +use pyo3::prelude::*; use super::py_expr_list; +use crate::common::data_type::PyScalarValue; +use crate::common::df_schema::PyDFSchema; +use crate::errors::{PyDataFusionResult, py_type_err}; +use crate::expr::PyExpr; +use crate::expr::logical_node::LogicalNode; +use crate::expr::sort_expr::{PySortExpr, py_sort_expr_list}; +use crate::sql::logical::PyLogicalPlan; -#[pyclass(frozen, name = "WindowExpr", module = "datafusion.expr", subclass)] +#[pyclass( + from_py_object, + frozen, + name = "WindowExpr", + module = "datafusion.expr", + subclass +)] #[derive(Clone)] pub struct PyWindowExpr { window: Window, } -#[pyclass(frozen, name = "WindowFrame", module = "datafusion.expr", subclass)] +#[pyclass( + from_py_object, + frozen, + name = "WindowFrame", + module = "datafusion.expr", + subclass +)] #[derive(Clone)] pub struct PyWindowFrame { window_frame: WindowFrame, @@ -55,6 +69,7 @@ impl From for PyWindowFrame { } #[pyclass( + from_py_object, frozen, name = "WindowFrameBound", module = "datafusion.expr", diff --git a/src/functions.rs b/crates/core/src/functions.rs similarity index 92% rename from src/functions.rs rename to crates/core/src/functions.rs index 5956b67cf..c32134054 100644 --- a/src/functions.rs +++ b/crates/core/src/functions.rs @@ -17,32 +17,25 @@ use std::collections::HashMap; +use 
datafusion::common::{Column, ScalarValue, TableReference}; +use datafusion::execution::FunctionRegistry; use datafusion::functions_aggregate::all_default_aggregate_functions; use datafusion::functions_window::all_default_window_functions; -use datafusion::logical_expr::expr::FieldMetadata; -use datafusion::logical_expr::expr::WindowFunctionParams; -use datafusion::logical_expr::ExprFunctionExt; -use datafusion::logical_expr::WindowFrame; -use pyo3::{prelude::*, wrap_pyfunction}; - -use crate::common::data_type::NullTreatment; -use crate::common::data_type::PyScalarValue; +use datafusion::logical_expr::expr::{ + Alias, FieldMetadata, NullTreatment as DFNullTreatment, WindowFunction, WindowFunctionParams, +}; +use datafusion::logical_expr::{Expr, ExprFunctionExt, WindowFrame, WindowFunctionDefinition, lit}; +use datafusion::{functions, functions_aggregate, functions_window}; +use pyo3::prelude::*; +use pyo3::wrap_pyfunction; + +use crate::common::data_type::{NullTreatment, PyScalarValue}; use crate::context::PySessionContext; -use crate::errors::PyDataFusionError; -use crate::errors::PyDataFusionResult; +use crate::errors::{PyDataFusionError, PyDataFusionResult}; +use crate::expr::PyExpr; use crate::expr::conditional_expr::PyCaseBuilder; -use crate::expr::sort_expr::to_sort_expressions; -use crate::expr::sort_expr::PySortExpr; +use crate::expr::sort_expr::{PySortExpr, to_sort_expressions}; use crate::expr::window::PyWindowFrame; -use crate::expr::PyExpr; -use datafusion::common::{Column, ScalarValue, TableReference}; -use datafusion::execution::FunctionRegistry; -use datafusion::functions; -use datafusion::functions_aggregate; -use datafusion::functions_window; -use datafusion::logical_expr::expr::Alias; -use datafusion::logical_expr::sqlparser::ast::NullTreatment as DFNullTreatment; -use datafusion::logical_expr::{expr::WindowFunction, lit, Expr, WindowFunctionDefinition}; fn add_builder_fns_to_aggregate( agg_fn: Expr, @@ -196,6 +189,29 @@ fn regexp_count( .into()) 
} +#[pyfunction] +#[pyo3(signature = (values, regex, start=None, n=None, flags=None, subexpr=None))] +/// Returns the position in a string where the specified occurrence of a regular expression is located +fn regexp_instr( + values: PyExpr, + regex: PyExpr, + start: Option, + n: Option, + flags: Option, + subexpr: Option, +) -> PyResult { + Ok(functions::expr_fn::regexp_instr( + values.into(), + regex.into(), + start.map(|x| x.expr).or(Some(lit(1))), + n.map(|x| x.expr).or(Some(lit(1))), + None, + flags.map(|x| x.expr).or(Some(lit(""))), + subexpr.map(|x| x.expr).or(Some(lit(0))), + ) + .into()) +} + /// Creates a new Sort Expr #[pyfunction] fn order_by(expr: PyExpr, asc: bool, nulls_first: bool) -> PyResult { @@ -448,7 +464,11 @@ macro_rules! array_fn { expr_fn!(abs, num); expr_fn!(acos, num); expr_fn!(acosh, num); -expr_fn!(ascii, arg1, "Returns the numeric code of the first character of the argument. In UTF8 encoding, returns the Unicode code point of the character. In other multibyte encodings, the argument must be an ASCII character."); +expr_fn!( + ascii, + arg1, + "Returns the numeric code of the first character of the argument. In UTF8 encoding, returns the Unicode code point of the character. In other multibyte encodings, the argument must be an ASCII character." +); expr_fn!(asin, num); expr_fn!(asinh, num); expr_fn!(atan, num); @@ -459,7 +479,10 @@ expr_fn!( arg, "Returns number of bits in the string (8 times the octet_length)." ); -expr_fn_vec!(btrim, "Removes the longest string containing only characters in characters (a space by default) from the start and end of string."); +expr_fn_vec!( + btrim, + "Removes the longest string containing only characters in characters (a space by default) from the start and end of string." 
+); expr_fn!(cbrt, num); expr_fn!(ceil, num); expr_fn!( @@ -482,7 +505,11 @@ expr_fn!(exp, num); expr_fn!(factorial, num); expr_fn!(floor, num); expr_fn!(gcd, x y); -expr_fn!(initcap, string, "Converts the first letter of each word to upper case and the rest to lower case. Words are sequences of alphanumeric characters separated by non-alphanumeric characters."); +expr_fn!( + initcap, + string, + "Converts the first letter of each word to upper case and the rest to lower case. Words are sequences of alphanumeric characters separated by non-alphanumeric characters." +); expr_fn!(isnan, num); expr_fn!(iszero, num); expr_fn!(levenshtein, string1 string2); @@ -493,8 +520,14 @@ expr_fn!(log, base num); expr_fn!(log10, num); expr_fn!(log2, num); expr_fn!(lower, arg1, "Converts the string to all lower case"); -expr_fn_vec!(lpad, "Extends the string to length length by prepending the characters fill (a space by default). If the string is already longer than length then it is truncated (on the right)."); -expr_fn_vec!(ltrim, "Removes the longest string containing only characters in characters (a space by default) from the start of string."); +expr_fn_vec!( + lpad, + "Extends the string to length length by prepending the characters fill (a space by default). If the string is already longer than length then it is truncated (on the right)." +); +expr_fn_vec!( + ltrim, + "Removes the longest string containing only characters in characters (a space by default) from the start of string." +); expr_fn!( md5, input_arg, @@ -511,7 +544,11 @@ expr_fn!( "Returns x if x is not NULL otherwise returns y." ); expr_fn!(nullif, arg_1 arg_2); -expr_fn!(octet_length, args, "Returns number of bytes in the string. Since this version of the function accepts type character directly, it will not strip trailing spaces."); +expr_fn!( + octet_length, + args, + "Returns number of bytes in the string. Since this version of the function accepts type character directly, it will not strip trailing spaces." 
+); expr_fn_vec!(overlay); expr_fn!(pi); expr_fn!(power, base exponent); @@ -529,8 +566,14 @@ expr_fn!( ); expr_fn!(right, string n, "Returns last n characters in the string, or when n is negative, returns all but first |n| characters."); expr_fn_vec!(round); -expr_fn_vec!(rpad, "Extends the string to length length by appending the characters fill (a space by default). If the string is already longer than length then it is truncated."); -expr_fn_vec!(rtrim, "Removes the longest string containing only characters in characters (a space by default) from the end of string."); +expr_fn_vec!( + rpad, + "Extends the string to length length by appending the characters fill (a space by default). If the string is already longer than length then it is truncated." +); +expr_fn_vec!( + rtrim, + "Removes the longest string containing only characters in characters (a space by default) from the end of string." +); expr_fn!(sha224, input_arg1); expr_fn!(sha256, input_arg1); expr_fn!(sha384, input_arg1); @@ -558,6 +601,9 @@ expr_fn!( "Converts the number to its equivalent hexadecimal representation." ); expr_fn!(now); +expr_fn_vec!(to_date); +expr_fn_vec!(to_local_time); +expr_fn_vec!(to_time); expr_fn_vec!(to_timestamp); expr_fn_vec!(to_timestamp_millis); expr_fn_vec!(to_timestamp_nanos); @@ -570,9 +616,13 @@ expr_fn!(date_part, part date); expr_fn!(date_trunc, part date); expr_fn!(date_bin, stride source origin); expr_fn!(make_date, year month day); +expr_fn!(to_char, datetime format); expr_fn!(translate, string from to, "Replaces each character in string that matches a character in the from set with the corresponding character in the to set. 
If from is longer than to, occurrences of the extra characters in from are deleted."); -expr_fn_vec!(trim, "Removes the longest string containing only characters in characters (a space by default) from the start, end, or both ends (BOTH is the default) of string."); +expr_fn_vec!( + trim, + "Removes the longest string containing only characters in characters (a space by default) from the start, end, or both ends (BOTH is the default) of string." +); expr_fn_vec!(trunc); expr_fn!(upper, arg1, "Converts the string to all upper case."); expr_fn!(uuid); @@ -965,6 +1015,7 @@ pub(crate) fn init_module(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_wrapped(wrap_pyfunction!(radians))?; m.add_wrapped(wrap_pyfunction!(random))?; m.add_wrapped(wrap_pyfunction!(regexp_count))?; + m.add_wrapped(wrap_pyfunction!(regexp_instr))?; m.add_wrapped(wrap_pyfunction!(regexp_like))?; m.add_wrapped(wrap_pyfunction!(regexp_match))?; m.add_wrapped(wrap_pyfunction!(regexp_replace))?; @@ -998,6 +1049,10 @@ pub(crate) fn init_module(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_wrapped(wrap_pyfunction!(tan))?; m.add_wrapped(wrap_pyfunction!(tanh))?; m.add_wrapped(wrap_pyfunction!(to_hex))?; + m.add_wrapped(wrap_pyfunction!(to_char))?; + m.add_wrapped(wrap_pyfunction!(to_date))?; + m.add_wrapped(wrap_pyfunction!(to_local_time))?; + m.add_wrapped(wrap_pyfunction!(to_time))?; m.add_wrapped(wrap_pyfunction!(to_timestamp))?; m.add_wrapped(wrap_pyfunction!(to_timestamp_millis))?; m.add_wrapped(wrap_pyfunction!(to_timestamp_nanos))?; diff --git a/src/lib.rs b/crates/core/src/lib.rs similarity index 92% rename from src/lib.rs rename to crates/core/src/lib.rs index 4f816d887..fc2d006d3 100644 --- a/src/lib.rs +++ b/crates/core/src/lib.rs @@ -15,19 +15,16 @@ // specific language governing permissions and limitations // under the License. 
-#[cfg(feature = "mimalloc")] -use mimalloc::MiMalloc; -use pyo3::prelude::*; - // Re-export Apache Arrow DataFusion dependencies -pub use datafusion; -pub use datafusion::common as datafusion_common; -pub use datafusion::logical_expr as datafusion_expr; -pub use datafusion::optimizer; -pub use datafusion::sql as datafusion_sql; - +pub use datafusion::{ + self, common as datafusion_common, logical_expr as datafusion_expr, optimizer, + sql as datafusion_sql, +}; #[cfg(feature = "substrait")] pub use datafusion_substrait; +#[cfg(feature = "mimalloc")] +use mimalloc::MiMalloc; +use pyo3::prelude::*; #[allow(clippy::borrow_deref_ref)] pub mod catalog; @@ -46,6 +43,7 @@ pub mod errors; pub mod expr; #[allow(clippy::borrow_deref_ref)] mod functions; +mod options; pub mod physical_plan; mod pyarrow_filter_expression; pub mod pyarrow_util; @@ -55,6 +53,7 @@ pub mod store; pub mod table; pub mod unparser; +mod array; #[cfg(feature = "substrait")] pub mod substrait; #[allow(clippy::borrow_deref_ref)] @@ -63,15 +62,11 @@ mod udaf; mod udf; pub mod udtf; mod udwf; -pub mod utils; #[cfg(feature = "mimalloc")] #[global_allocator] static GLOBAL: MiMalloc = MiMalloc; -// Used to define Tokio Runtime as a Python module attribute -pub(crate) struct TokioRuntime(tokio::runtime::Runtime); - /// Low-level DataFusion internal package. 
/// /// The higher-level public API is defined in pure python files under the @@ -128,6 +123,10 @@ fn _internal(py: Python, m: Bound<'_, PyModule>) -> PyResult<()> { store::init_module(&store)?; m.add_submodule(&store)?; + let options = PyModule::new(py, "options")?; + options::init_module(&options)?; + m.add_submodule(&options)?; + // Register substrait as a submodule #[cfg(feature = "substrait")] setup_substrait_module(py, &m)?; diff --git a/crates/core/src/options.rs b/crates/core/src/options.rs new file mode 100644 index 000000000..6b6037695 --- /dev/null +++ b/crates/core/src/options.rs @@ -0,0 +1,159 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use arrow::datatypes::{DataType, Schema}; +use arrow::pyarrow::PyArrowType; +use datafusion::prelude::CsvReadOptions; +use pyo3::prelude::{PyModule, PyModuleMethods}; +use pyo3::{Bound, PyResult, pyclass, pymethods}; + +use crate::context::parse_file_compression_type; +use crate::errors::PyDataFusionError; +use crate::expr::sort_expr::PySortExpr; + +/// Options for reading CSV files +#[pyclass(name = "CsvReadOptions", module = "datafusion.options", frozen)] +pub struct PyCsvReadOptions { + pub has_header: bool, + pub delimiter: u8, + pub quote: u8, + pub terminator: Option, + pub escape: Option, + pub comment: Option, + pub newlines_in_values: bool, + pub schema: Option>, + pub schema_infer_max_records: usize, + pub file_extension: String, + pub table_partition_cols: Vec<(String, PyArrowType)>, + pub file_compression_type: String, + pub file_sort_order: Vec>, + pub null_regex: Option, + pub truncated_rows: bool, +} + +#[pymethods] +impl PyCsvReadOptions { + #[allow(clippy::too_many_arguments)] + #[pyo3(signature = ( + has_header=true, + delimiter=b',', + quote=b'"', + terminator=None, + escape=None, + comment=None, + newlines_in_values=false, + schema=None, + schema_infer_max_records=1000, + file_extension=".csv".to_string(), + table_partition_cols=vec![], + file_compression_type="".to_string(), + file_sort_order=vec![], + null_regex=None, + truncated_rows=false + ))] + #[new] + fn new( + has_header: bool, + delimiter: u8, + quote: u8, + terminator: Option, + escape: Option, + comment: Option, + newlines_in_values: bool, + schema: Option>, + schema_infer_max_records: usize, + file_extension: String, + table_partition_cols: Vec<(String, PyArrowType)>, + file_compression_type: String, + file_sort_order: Vec>, + null_regex: Option, + truncated_rows: bool, + ) -> Self { + Self { + has_header, + delimiter, + quote, + terminator, + escape, + comment, + newlines_in_values, + schema, + schema_infer_max_records, + file_extension, + table_partition_cols, + 
file_compression_type, + file_sort_order, + null_regex, + truncated_rows, + } + } +} + +impl<'a> TryFrom<&'a PyCsvReadOptions> for CsvReadOptions<'a> { + type Error = PyDataFusionError; + + fn try_from(value: &'a PyCsvReadOptions) -> Result, Self::Error> { + let partition_cols: Vec<(String, DataType)> = value + .table_partition_cols + .iter() + .map(|(name, dtype)| (name.clone(), dtype.0.clone())) + .collect(); + + let compression = parse_file_compression_type(Some(value.file_compression_type.clone()))?; + + let sort_order: Vec> = value + .file_sort_order + .iter() + .map(|inner| { + inner + .iter() + .map(|sort_expr| sort_expr.sort.clone()) + .collect() + }) + .collect(); + + // Explicit struct initialization to catch upstream changes + let mut options = CsvReadOptions { + has_header: value.has_header, + delimiter: value.delimiter, + quote: value.quote, + terminator: value.terminator, + escape: value.escape, + comment: value.comment, + newlines_in_values: value.newlines_in_values, + schema: None, // Will be set separately due to lifetime constraints + schema_infer_max_records: value.schema_infer_max_records, + file_extension: value.file_extension.as_str(), + table_partition_cols: partition_cols, + file_compression_type: compression, + file_sort_order: sort_order, + null_regex: value.null_regex.clone(), + truncated_rows: value.truncated_rows, + }; + + // Set schema separately to handle the lifetime + options.schema = value.schema.as_ref().map(|s| &s.0); + + Ok(options) + } +} + +pub(crate) fn init_module(m: &Bound<'_, PyModule>) -> PyResult<()> { + m.add_class::()?; + + Ok(()) +} diff --git a/src/physical_plan.rs b/crates/core/src/physical_plan.rs similarity index 85% rename from src/physical_plan.rs rename to crates/core/src/physical_plan.rs index 4994b0114..8674a8b55 100644 --- a/src/physical_plan.rs +++ b/crates/core/src/physical_plan.rs @@ -15,16 +15,25 @@ // specific language governing permissions and limitations // under the License. 
-use datafusion::physical_plan::{displayable, ExecutionPlan, ExecutionPlanProperties}; -use datafusion_proto::physical_plan::{AsExecutionPlan, DefaultPhysicalExtensionCodec}; -use prost::Message; use std::sync::Arc; -use pyo3::{exceptions::PyRuntimeError, prelude::*, types::PyBytes}; - -use crate::{context::PySessionContext, errors::PyDataFusionResult}; - -#[pyclass(frozen, name = "ExecutionPlan", module = "datafusion", subclass)] +use datafusion::physical_plan::{ExecutionPlan, ExecutionPlanProperties, displayable}; +use datafusion_proto::physical_plan::{AsExecutionPlan, DefaultPhysicalExtensionCodec}; +use prost::Message; +use pyo3::exceptions::PyRuntimeError; +use pyo3::prelude::*; +use pyo3::types::PyBytes; + +use crate::context::PySessionContext; +use crate::errors::PyDataFusionResult; + +#[pyclass( + from_py_object, + frozen, + name = "ExecutionPlan", + module = "datafusion", + subclass +)] #[derive(Debug, Clone)] pub struct PyExecutionPlan { pub plan: Arc, @@ -74,7 +83,7 @@ impl PyExecutionPlan { ctx: PySessionContext, proto_msg: Bound<'_, PyBytes>, ) -> PyDataFusionResult { - let bytes: &[u8] = proto_msg.extract()?; + let bytes: &[u8] = proto_msg.extract().map_err(Into::::into)?; let proto_plan = datafusion_proto::protobuf::PhysicalPlanNode::decode(bytes).map_err(|e| { PyRuntimeError::new_err(format!( @@ -83,7 +92,7 @@ impl PyExecutionPlan { })?; let codec = DefaultPhysicalExtensionCodec {}; - let plan = proto_plan.try_into_physical_plan(&ctx.ctx, &ctx.ctx.runtime_env(), &codec)?; + let plan = proto_plan.try_into_physical_plan(ctx.ctx.task_ctx().as_ref(), &codec)?; Ok(Self::new(plan)) } diff --git a/src/pyarrow_filter_expression.rs b/crates/core/src/pyarrow_filter_expression.rs similarity index 95% rename from src/pyarrow_filter_expression.rs rename to crates/core/src/pyarrow_filter_expression.rs index 7fbb1dc2a..e3b4b6009 100644 --- a/src/pyarrow_filter_expression.rs +++ b/crates/core/src/pyarrow_filter_expression.rs @@ -15,21 +15,21 @@ // specific language 
governing permissions and limitations // under the License. -/// Converts a Datafusion logical plan expression (Expr) into a PyArrow compute expression -use pyo3::{prelude::*, IntoPyObjectExt}; - use std::convert::TryFrom; use std::result::Result; use datafusion::common::{Column, ScalarValue}; -use datafusion::logical_expr::{expr::InList, Between, BinaryExpr, Expr, Operator}; +use datafusion::logical_expr::expr::InList; +use datafusion::logical_expr::{Between, BinaryExpr, Expr, Operator}; +/// Converts a Datafusion logical plan expression (Expr) into a PyArrow compute expression +use pyo3::{IntoPyObjectExt, prelude::*}; use crate::errors::{PyDataFusionError, PyDataFusionResult}; use crate::pyarrow_util::scalar_to_pyarrow; #[derive(Debug)] #[repr(transparent)] -pub(crate) struct PyArrowFilterExpression(PyObject); +pub(crate) struct PyArrowFilterExpression(Py); fn operator_to_py<'py>( operator: &Operator, @@ -47,7 +47,7 @@ fn operator_to_py<'py>( _ => { return Err(PyDataFusionError::Common(format!( "Unsupported operator {operator:?}" - ))) + ))); } }; Ok(py_op) @@ -57,7 +57,7 @@ fn extract_scalar_list<'py>( exprs: &[Expr], py: Python<'py>, ) -> PyDataFusionResult>> { - let ret = exprs + exprs .iter() .map(|expr| match expr { // TODO: should we also leverage `ScalarValue::to_pyarrow` here? 
@@ -83,12 +83,11 @@ fn extract_scalar_list<'py>( "Only a list of Literals are supported got {expr:?}" ))), }) - .collect(); - ret + .collect() } impl PyArrowFilterExpression { - pub fn inner(&self) -> &PyObject { + pub fn inner(&self) -> &Py { &self.0 } } @@ -101,12 +100,12 @@ impl TryFrom<&Expr> for PyArrowFilterExpression { // isin, is_null, and is_valid (~is_null) are methods of pyarrow.dataset.Expression // https://arrow.apache.org/docs/python/generated/pyarrow.dataset.Expression.html#pyarrow-dataset-expression fn try_from(expr: &Expr) -> Result { - Python::with_gil(|py| { + Python::attach(|py| { let pc = Python::import(py, "pyarrow.compute")?; let op_module = Python::import(py, "operator")?; let pc_expr: PyDataFusionResult> = match expr { Expr::Column(Column { name, .. }) => Ok(pc.getattr("field")?.call1((name,))?), - Expr::Literal(scalar, _) => Ok(scalar_to_pyarrow(scalar, py)?.into_bound(py)), + Expr::Literal(scalar, _) => Ok(scalar_to_pyarrow(scalar, py)?), Expr::BinaryExpr(BinaryExpr { left, op, right }) => { let operator = operator_to_py(op, &op_module)?; let left = PyArrowFilterExpression::try_from(left.as_ref())?.0; diff --git a/crates/core/src/pyarrow_util.rs b/crates/core/src/pyarrow_util.rs new file mode 100644 index 000000000..1401a4938 --- /dev/null +++ b/crates/core/src/pyarrow_util.rs @@ -0,0 +1,163 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Conversions between PyArrow and DataFusion types + +use std::sync::Arc; + +use arrow::array::{Array, ArrayData, ArrayRef, ListArray, make_array}; +use arrow::buffer::OffsetBuffer; +use arrow::datatypes::Field; +use arrow::pyarrow::{FromPyArrow, ToPyArrow}; +use datafusion::common::exec_err; +use datafusion::scalar::ScalarValue; +use pyo3::types::{PyAnyMethods, PyList}; +use pyo3::{Borrowed, Bound, FromPyObject, PyAny, PyErr, PyResult, Python}; + +use crate::common::data_type::PyScalarValue; +use crate::errors::PyDataFusionError; + +/// Helper function to turn an Array into a ScalarValue. If ``as_list_array`` is true, +/// the array will be turned into a ``ListArray``. Otherwise, we extract the first value +/// from the array. +fn array_to_scalar_value(array: ArrayRef, as_list_array: bool) -> PyResult { + if as_list_array { + let field = Arc::new(Field::new_list_field( + array.data_type().clone(), + array.nulls().is_some(), + )); + let offsets = OffsetBuffer::from_lengths(vec![array.len()]); + let list_array = ListArray::new(field, offsets, array, None); + Ok(PyScalarValue(ScalarValue::List(Arc::new(list_array)))) + } else { + let scalar = ScalarValue::try_from_array(&array, 0).map_err(PyDataFusionError::from)?; + Ok(PyScalarValue(scalar)) + } +} + +/// Helper function to take any Python object that contains an Arrow PyCapsule +/// interface and attempt to extract a scalar value from it. If `as_list_array` +/// is true, the array will be turned into a ``ListArray``. Otherwise, we extract +/// the first value from the array. 
+fn pyobj_extract_scalar_via_capsule( + value: &Bound<'_, PyAny>, + as_list_array: bool, +) -> PyResult { + let array_data = ArrayData::from_pyarrow_bound(value)?; + let array = make_array(array_data); + + array_to_scalar_value(array, as_list_array) +} + +impl FromPyArrow for PyScalarValue { + fn from_pyarrow_bound(value: &Bound<'_, PyAny>) -> PyResult { + let py = value.py(); + let pyarrow_mod = py.import("pyarrow"); + + // Is it a PyArrow object? + if let Ok(pa) = pyarrow_mod.as_ref() { + let scalar_type = pa.getattr("Scalar")?; + if value.is_instance(&scalar_type)? { + let typ = value.getattr("type")?; + + // construct pyarrow array from the python value and pyarrow type + let factory = py.import("pyarrow")?.getattr("array")?; + let args = PyList::new(py, [value])?; + let array = factory.call1((args, typ))?; + + return pyobj_extract_scalar_via_capsule(&array, false); + } + + let array_type = pa.getattr("Array")?; + if value.is_instance(&array_type)? { + return pyobj_extract_scalar_via_capsule(value, true); + } + } + + // Is it a NanoArrow scalar? + if let Ok(na) = py.import("nanoarrow") { + let scalar_type = py.import("nanoarrow.array")?.getattr("Scalar")?; + if value.is_instance(&scalar_type)? { + return pyobj_extract_scalar_via_capsule(value, false); + } + let array_type = na.getattr("Array")?; + if value.is_instance(&array_type)? { + return pyobj_extract_scalar_via_capsule(value, true); + } + } + + // Is it a arro3 scalar? + if let Ok(arro3) = py.import("arro3").and_then(|arro3| arro3.getattr("core")) { + let scalar_type = arro3.getattr("Scalar")?; + if value.is_instance(&scalar_type)? { + return pyobj_extract_scalar_via_capsule(value, false); + } + let array_type = arro3.getattr("Array")?; + if value.is_instance(&array_type)? { + return pyobj_extract_scalar_via_capsule(value, true); + } + } + + // Does it have a PyCapsule interface but isn't one of our known libraries? + // If so do our "best guess". 
Try checking type name, and if that fails + // return a single value if the length is 1 and return a List value otherwise + if value.hasattr("__arrow_c_array__")? { + let type_name = value.get_type().repr()?; + if type_name.contains("Scalar")? { + return pyobj_extract_scalar_via_capsule(value, false); + } + if type_name.contains("Array")? { + return pyobj_extract_scalar_via_capsule(value, true); + } + + let array_data = ArrayData::from_pyarrow_bound(value)?; + let array = make_array(array_data); + + let as_array_list = array.len() != 1; + return array_to_scalar_value(array, as_array_list); + } + + // Last attempt - try to create a PyArrow scalar from a plain Python object + if let Ok(pa) = pyarrow_mod.as_ref() { + let scalar = pa.call_method1("scalar", (value,))?; + + PyScalarValue::from_pyarrow_bound(&scalar) + } else { + exec_err!("Unable to import scalar value").map_err(PyDataFusionError::from)? + } + } +} + +impl<'source> FromPyObject<'_, 'source> for PyScalarValue { + type Error = PyErr; + + fn extract(value: Borrowed<'_, 'source, PyAny>) -> Result { + Self::from_pyarrow_bound(&value) + } +} + +pub fn scalar_to_pyarrow<'py>( + scalar: &ScalarValue, + py: Python<'py>, +) -> PyResult> { + let array = scalar.to_array().map_err(PyDataFusionError::from)?; + // convert to pyarrow array using C data interface + let pyarray = array.to_data().to_pyarrow(py)?; + let pyscalar = pyarray.call_method1("__getitem__", (0,))?; + + Ok(pyscalar) +} diff --git a/src/record_batch.rs b/crates/core/src/record_batch.rs similarity index 82% rename from src/record_batch.rs rename to crates/core/src/record_batch.rs index c3658cf4b..0492c6c76 100644 --- a/src/record_batch.rs +++ b/crates/core/src/record_batch.rs @@ -17,17 +17,18 @@ use std::sync::Arc; -use crate::errors::PyDataFusionError; -use crate::utils::wait_for_future; use datafusion::arrow::pyarrow::ToPyArrow; use datafusion::arrow::record_batch::RecordBatch; use datafusion::physical_plan::SendableRecordBatchStream; +use 
datafusion_python_util::wait_for_future; use futures::StreamExt; use pyo3::exceptions::{PyStopAsyncIteration, PyStopIteration}; use pyo3::prelude::*; -use pyo3::{pyclass, pymethods, PyObject, PyResult, Python}; +use pyo3::{PyAny, PyResult, Python, pyclass, pymethods}; use tokio::sync::Mutex; +use crate::errors::PyDataFusionError; + #[pyclass(name = "RecordBatch", module = "datafusion", subclass, frozen)] pub struct PyRecordBatch { batch: RecordBatch, @@ -35,7 +36,7 @@ pub struct PyRecordBatch { #[pymethods] impl PyRecordBatch { - fn to_pyarrow(&self, py: Python) -> PyResult { + fn to_pyarrow<'py>(&self, py: Python<'py>) -> PyResult> { self.batch.to_pyarrow(py) } } @@ -84,15 +85,21 @@ impl PyRecordBatchStream { } } +/// Polls the next batch from a `SendableRecordBatchStream`, converting the `Option>` form. +pub(crate) async fn poll_next_batch( + stream: &mut SendableRecordBatchStream, +) -> datafusion::error::Result> { + stream.next().await.transpose() +} + async fn next_stream( stream: Arc>, sync: bool, ) -> PyResult { let mut stream = stream.lock().await; - match stream.next().await { - Some(Ok(batch)) => Ok(batch.into()), - Some(Err(e)) => Err(PyDataFusionError::from(e))?, - None => { + match poll_next_batch(&mut stream).await { + Ok(Some(batch)) => Ok(batch.into()), + Ok(None) => { // Depending on whether the iteration is sync or not, we raise either a // StopIteration or a StopAsyncIteration if sync { @@ -101,5 +108,6 @@ async fn next_stream( Err(PyStopAsyncIteration::new_err("stream exhausted")) } } + Err(e) => Err(PyDataFusionError::from(e))?, } } diff --git a/src/sql.rs b/crates/core/src/sql.rs similarity index 97% rename from src/sql.rs rename to crates/core/src/sql.rs index 9f1fe81be..dea9b566a 100644 --- a/src/sql.rs +++ b/crates/core/src/sql.rs @@ -17,3 +17,4 @@ pub mod exceptions; pub mod logical; +pub(crate) mod util; diff --git a/src/sql/exceptions.rs b/crates/core/src/sql/exceptions.rs similarity index 100% rename from src/sql/exceptions.rs rename to 
crates/core/src/sql/exceptions.rs diff --git a/src/sql/logical.rs b/crates/core/src/sql/logical.rs similarity index 93% rename from src/sql/logical.rs rename to crates/core/src/sql/logical.rs index 47ea39fdc..631aa9b09 100644 --- a/src/sql/logical.rs +++ b/crates/core/src/sql/logical.rs @@ -17,6 +17,13 @@ use std::sync::Arc; +use datafusion::logical_expr::{DdlStatement, LogicalPlan, Statement}; +use datafusion_proto::logical_plan::{AsLogicalPlan, DefaultLogicalExtensionCodec}; +use prost::Message; +use pyo3::exceptions::PyRuntimeError; +use pyo3::prelude::*; +use pyo3::types::PyBytes; + use crate::context::PySessionContext; use crate::errors::PyDataFusionResult; use crate::expr::aggregate::PyAggregate; @@ -42,12 +49,14 @@ use crate::expr::extension::PyExtension; use crate::expr::filter::PyFilter; use crate::expr::join::PyJoin; use crate::expr::limit::PyLimit; +use crate::expr::logical_node::LogicalNode; use crate::expr::projection::PyProjection; use crate::expr::recursive_query::PyRecursiveQuery; use crate::expr::repartition::PyRepartition; use crate::expr::sort::PySort; use crate::expr::statement::{ - PyDeallocate, PyExecute, PyPrepare, PySetVariable, PyTransactionEnd, PyTransactionStart, + PyDeallocate, PyExecute, PyPrepare, PyResetVariable, PySetVariable, PyTransactionEnd, + PyTransactionStart, }; use crate::expr::subquery::PySubquery; use crate::expr::subquery_alias::PySubqueryAlias; @@ -56,15 +65,16 @@ use crate::expr::union::PyUnion; use crate::expr::unnest::PyUnnest; use crate::expr::values::PyValues; use crate::expr::window::PyWindowExpr; -use datafusion::logical_expr::{DdlStatement, LogicalPlan, Statement}; -use datafusion_proto::logical_plan::{AsLogicalPlan, DefaultLogicalExtensionCodec}; -use prost::Message; -use pyo3::{exceptions::PyRuntimeError, prelude::*, types::PyBytes}; -use crate::expr::logical_node::LogicalNode; - -#[pyclass(frozen, name = "LogicalPlan", module = "datafusion", subclass)] -#[derive(Debug, Clone)] +#[pyclass( + from_py_object, + 
frozen, + name = "LogicalPlan", + module = "datafusion", + subclass, + eq +)] +#[derive(Debug, Clone, PartialEq, Eq)] pub struct PyLogicalPlan { pub(crate) plan: Arc, } @@ -113,6 +123,9 @@ impl PyLogicalPlan { PyTransactionEnd::from(plan.clone()).to_variant(py) } Statement::SetVariable(plan) => PySetVariable::from(plan.clone()).to_variant(py), + Statement::ResetVariable(plan) => { + PyResetVariable::from(plan.clone()).to_variant(py) + } Statement::Prepare(plan) => PyPrepare::from(plan.clone()).to_variant(py), Statement::Execute(plan) => PyExecute::from(plan.clone()).to_variant(py), Statement::Deallocate(plan) => PyDeallocate::from(plan.clone()).to_variant(py), @@ -197,7 +210,7 @@ impl PyLogicalPlan { ctx: PySessionContext, proto_msg: Bound<'_, PyBytes>, ) -> PyDataFusionResult { - let bytes: &[u8] = proto_msg.extract()?; + let bytes: &[u8] = proto_msg.extract().map_err(Into::::into)?; let proto_plan = datafusion_proto::protobuf::LogicalPlanNode::decode(bytes).map_err(|e| { PyRuntimeError::new_err(format!( @@ -206,7 +219,7 @@ impl PyLogicalPlan { })?; let codec = DefaultLogicalExtensionCodec {}; - let plan = proto_plan.try_into_logical_plan(&ctx.ctx, &codec)?; + let plan = proto_plan.try_into_logical_plan(&ctx.ctx.task_ctx(), &codec)?; Ok(Self::new(plan)) } } diff --git a/crates/core/src/sql/util.rs b/crates/core/src/sql/util.rs new file mode 100644 index 000000000..d1e8964f8 --- /dev/null +++ b/crates/core/src/sql/util.rs @@ -0,0 +1,87 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::collections::HashMap; + +use datafusion::common::{DataFusionError, exec_err, plan_datafusion_err}; +use datafusion::logical_expr::sqlparser::dialect::dialect_from_str; +use datafusion::sql::sqlparser::dialect::Dialect; +use datafusion::sql::sqlparser::parser::Parser; +use datafusion::sql::sqlparser::tokenizer::{Token, Tokenizer}; + +fn tokens_from_replacements( + placeholder: &str, + replacements: &HashMap>, +) -> Option> { + if let Some(pattern) = placeholder.strip_prefix("$") { + replacements.get(pattern).cloned() + } else { + None + } +} + +fn get_tokens_for_string_replacement( + dialect: &dyn Dialect, + replacements: HashMap, +) -> Result>, DataFusionError> { + replacements + .into_iter() + .map(|(name, value)| { + let tokens = Tokenizer::new(dialect, &value) + .tokenize() + .map_err(|err| DataFusionError::External(err.into()))?; + Ok((name, tokens)) + }) + .collect() +} + +pub(crate) fn replace_placeholders_with_strings( + query: &str, + dialect: &str, + replacements: HashMap, +) -> Result { + let dialect = dialect_from_str(dialect) + .ok_or_else(|| plan_datafusion_err!("Unsupported SQL dialect: {dialect}."))?; + + let replacements = get_tokens_for_string_replacement(dialect.as_ref(), replacements)?; + + let tokens = Tokenizer::new(dialect.as_ref(), query) + .tokenize() + .map_err(|err| DataFusionError::External(err.into()))?; + + let replaced_tokens = tokens + .into_iter() + .flat_map(|token| { + if let Token::Placeholder(placeholder) = &token { + tokens_from_replacements(placeholder, &replacements).unwrap_or(vec![token]) + } else 
{ + vec![token] + } + }) + .collect::>(); + + let statement = Parser::new(dialect.as_ref()) + .with_tokens(replaced_tokens) + .parse_statements() + .map_err(|err| DataFusionError::External(Box::new(err)))?; + + if statement.len() != 1 { + return exec_err!("placeholder replacement should return exactly one statement"); + } + + Ok(statement[0].to_string()) +} diff --git a/src/store.rs b/crates/core/src/store.rs similarity index 88% rename from src/store.rs rename to crates/core/src/store.rs index 998681854..8535e83b7 100644 --- a/src/store.rs +++ b/crates/core/src/store.rs @@ -17,14 +17,13 @@ use std::sync::Arc; -use pyo3::prelude::*; - use object_store::aws::{AmazonS3, AmazonS3Builder}; use object_store::azure::{MicrosoftAzure, MicrosoftAzureBuilder}; use object_store::gcp::{GoogleCloudStorage, GoogleCloudStorageBuilder}; use object_store::http::{HttpBuilder, HttpStore}; use object_store::local::LocalFileSystem; use pyo3::exceptions::PyValueError; +use pyo3::prelude::*; use url::Url; #[derive(FromPyObject)] @@ -37,6 +36,7 @@ pub enum StorageContexts { } #[pyclass( + from_py_object, frozen, name = "LocalFileSystem", module = "datafusion.store", @@ -67,7 +67,13 @@ impl PyLocalFileSystemContext { } } -#[pyclass(frozen, name = "MicrosoftAzure", module = "datafusion.store", subclass)] +#[pyclass( + from_py_object, + frozen, + name = "MicrosoftAzure", + module = "datafusion.store", + subclass +)] #[derive(Debug, Clone)] pub struct PyMicrosoftAzureContext { pub inner: Arc, @@ -77,7 +83,7 @@ pub struct PyMicrosoftAzureContext { #[pymethods] impl PyMicrosoftAzureContext { #[allow(clippy::too_many_arguments)] - #[pyo3(signature = (container_name, account=None, access_key=None, bearer_token=None, client_id=None, client_secret=None, tenant_id=None, sas_query_pairs=None, use_emulator=None, allow_http=None))] + #[pyo3(signature = (container_name, account=None, access_key=None, bearer_token=None, client_id=None, client_secret=None, tenant_id=None, sas_query_pairs=None, 
use_emulator=None, allow_http=None, use_fabric_endpoint=None))] #[new] fn new( container_name: String, @@ -90,6 +96,7 @@ impl PyMicrosoftAzureContext { sas_query_pairs: Option>, use_emulator: Option, allow_http: Option, + use_fabric_endpoint: Option, ) -> Self { let mut builder = MicrosoftAzureBuilder::from_env().with_container_name(&container_name); @@ -128,6 +135,10 @@ impl PyMicrosoftAzureContext { builder = builder.with_allow_http(allow_http); } + if let Some(use_fabric_endpoint) = use_fabric_endpoint { + builder = builder.with_use_fabric_endpoint(use_fabric_endpoint); + } + Self { inner: Arc::new( builder @@ -139,7 +150,13 @@ impl PyMicrosoftAzureContext { } } -#[pyclass(frozen, name = "GoogleCloud", module = "datafusion.store", subclass)] +#[pyclass( + from_py_object, + frozen, + name = "GoogleCloud", + module = "datafusion.store", + subclass +)] #[derive(Debug, Clone)] pub struct PyGoogleCloudContext { pub inner: Arc, @@ -169,7 +186,13 @@ impl PyGoogleCloudContext { } } -#[pyclass(frozen, name = "AmazonS3", module = "datafusion.store", subclass)] +#[pyclass( + from_py_object, + frozen, + name = "AmazonS3", + module = "datafusion.store", + subclass +)] #[derive(Debug, Clone)] pub struct PyAmazonS3Context { pub inner: Arc, @@ -179,13 +202,14 @@ pub struct PyAmazonS3Context { #[pymethods] impl PyAmazonS3Context { #[allow(clippy::too_many_arguments)] - #[pyo3(signature = (bucket_name, region=None, access_key_id=None, secret_access_key=None, endpoint=None, allow_http=false, imdsv1_fallback=false))] + #[pyo3(signature = (bucket_name, region=None, access_key_id=None, secret_access_key=None, session_token=None, endpoint=None, allow_http=false, imdsv1_fallback=false))] #[new] fn new( bucket_name: String, region: Option, access_key_id: Option, secret_access_key: Option, + session_token: Option, endpoint: Option, //retry_config: RetryConfig, allow_http: bool, @@ -206,6 +230,10 @@ impl PyAmazonS3Context { builder = builder.with_secret_access_key(secret_access_key); }; + 
if let Some(session_token) = session_token { + builder = builder.with_token(session_token); + } + if let Some(endpoint) = endpoint { builder = builder.with_endpoint(endpoint); }; @@ -228,7 +256,13 @@ impl PyAmazonS3Context { } } -#[pyclass(frozen, name = "Http", module = "datafusion.store", subclass)] +#[pyclass( + from_py_object, + frozen, + name = "Http", + module = "datafusion.store", + subclass +)] #[derive(Debug, Clone)] pub struct PyHttpContext { pub url: String, diff --git a/src/substrait.rs b/crates/core/src/substrait.rs similarity index 77% rename from src/substrait.rs rename to crates/core/src/substrait.rs index 291892cf8..27e446f48 100644 --- a/src/substrait.rs +++ b/crates/core/src/substrait.rs @@ -15,19 +15,25 @@ // specific language governing permissions and limitations // under the License. -use pyo3::{prelude::*, types::PyBytes}; - -use crate::context::PySessionContext; -use crate::errors::{py_datafusion_err, PyDataFusionError, PyDataFusionResult}; -use crate::sql::logical::PyLogicalPlan; -use crate::utils::wait_for_future; - +use datafusion_python_util::wait_for_future; use datafusion_substrait::logical_plan::{consumer, producer}; use datafusion_substrait::serializer; use datafusion_substrait::substrait::proto::Plan; use prost::Message; +use pyo3::prelude::*; +use pyo3::types::PyBytes; + +use crate::context::PySessionContext; +use crate::errors::{PyDataFusionError, PyDataFusionResult, py_datafusion_err, to_datafusion_err}; +use crate::sql::logical::PyLogicalPlan; -#[pyclass(frozen, name = "Plan", module = "datafusion.substrait", subclass)] +#[pyclass( + from_py_object, + frozen, + name = "Plan", + module = "datafusion.substrait", + subclass +)] #[derive(Debug, Clone)] pub struct PyPlan { pub plan: Plan, @@ -35,13 +41,26 @@ pub struct PyPlan { #[pymethods] impl PyPlan { - fn encode(&self, py: Python) -> PyResult { + fn encode(&self, py: Python) -> PyResult> { let mut proto_bytes = Vec::::new(); self.plan .encode(&mut proto_bytes) 
.map_err(PyDataFusionError::EncodeError)?; Ok(PyBytes::new(py, &proto_bytes).into()) } + + /// Get the JSON representation of the substrait plan + fn to_json(&self) -> PyDataFusionResult { + let json = serde_json::to_string_pretty(&self.plan).map_err(to_datafusion_err)?; + Ok(json) + } + + /// Parse a Substrait Plan from its JSON representation + #[staticmethod] + fn from_json(json: &str) -> PyDataFusionResult { + let plan: Plan = serde_json::from_str(json).map_err(to_datafusion_err)?; + Ok(PyPlan { plan }) + } } impl From for Plan { @@ -59,7 +78,13 @@ impl From for PyPlan { /// A PySubstraitSerializer is a representation of a Serializer that is capable of both serializing /// a `LogicalPlan` instance to Substrait Protobuf bytes and also deserialize Substrait Protobuf bytes /// to a valid `LogicalPlan` instance. -#[pyclass(frozen, name = "Serde", module = "datafusion.substrait", subclass)] +#[pyclass( + from_py_object, + frozen, + name = "Serde", + module = "datafusion.substrait", + subclass +)] #[derive(Debug, Clone)] pub struct PySubstraitSerializer; @@ -83,7 +108,7 @@ impl PySubstraitSerializer { py: Python, ) -> PyDataFusionResult { PySubstraitSerializer::serialize_bytes(sql, ctx, py).and_then(|proto_bytes| { - let proto_bytes = proto_bytes.bind(py).downcast::().unwrap(); + let proto_bytes = proto_bytes.bind(py).cast::().unwrap(); PySubstraitSerializer::deserialize_bytes(proto_bytes.as_bytes().to_vec(), py) }) } @@ -93,7 +118,7 @@ impl PySubstraitSerializer { sql: &str, ctx: PySessionContext, py: Python, - ) -> PyDataFusionResult { + ) -> PyDataFusionResult> { let proto_bytes: Vec = wait_for_future(py, serializer::serialize_bytes(sql, &ctx.ctx))??; Ok(PyBytes::new(py, &proto_bytes).into()) @@ -112,7 +137,13 @@ impl PySubstraitSerializer { } } -#[pyclass(frozen, name = "Producer", module = "datafusion.substrait", subclass)] +#[pyclass( + from_py_object, + frozen, + name = "Producer", + module = "datafusion.substrait", + subclass +)] #[derive(Debug, Clone)] pub 
struct PySubstraitProducer; @@ -129,7 +160,13 @@ impl PySubstraitProducer { } } -#[pyclass(frozen, name = "Consumer", module = "datafusion.substrait", subclass)] +#[pyclass( + from_py_object, + frozen, + name = "Consumer", + module = "datafusion.substrait", + subclass +)] #[derive(Debug, Clone)] pub struct PySubstraitConsumer; diff --git a/crates/core/src/table.rs b/crates/core/src/table.rs new file mode 100644 index 000000000..623349771 --- /dev/null +++ b/crates/core/src/table.rs @@ -0,0 +1,261 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use std::any::Any; +use std::sync::Arc; + +use arrow::datatypes::SchemaRef; +use arrow::pyarrow::ToPyArrow; +use async_trait::async_trait; +use datafusion::catalog::{Session, TableProviderFactory}; +use datafusion::common::Column; +use datafusion::datasource::{TableProvider, TableType}; +use datafusion::logical_expr::{ + CreateExternalTable, Expr, LogicalPlanBuilder, TableProviderFilterPushDown, +}; +use datafusion::physical_plan::ExecutionPlan; +use datafusion::prelude::DataFrame; +use datafusion_ffi::proto::logical_extension_codec::FFI_LogicalExtensionCodec; +use datafusion_python_util::{create_logical_extension_capsule, table_provider_from_pycapsule}; +use pyo3::IntoPyObjectExt; +use pyo3::prelude::*; + +use crate::context::PySessionContext; +use crate::dataframe::PyDataFrame; +use crate::dataset::Dataset; +use crate::errors; +use crate::expr::create_external_table::PyCreateExternalTable; + +/// This struct is used as a common method for all TableProviders, +/// whether they refer to an FFI provider, an internally known +/// implementation, a dataset, or a dataframe view. +#[pyclass( + from_py_object, + frozen, + name = "RawTable", + module = "datafusion.catalog", + subclass +)] +#[derive(Clone)] +pub struct PyTable { + pub table: Arc, +} + +impl PyTable { + pub fn table(&self) -> Arc { + self.table.clone() + } +} + +#[pymethods] +impl PyTable { + /// Instantiate from any Python object that supports any of the table + /// types. We do not know a priori when using this method if the object + /// will be passed a wrapped or raw class. 
Here we handle all of the + /// following object types: + /// + /// - PyTable (essentially a clone operation), but either raw or wrapped + /// - DataFrame, either raw or wrapped + /// - FFI Table Providers via PyCapsule + /// - PyArrow Dataset objects + #[new] + pub fn new(obj: Bound<'_, PyAny>, session: Option>) -> PyResult { + let py = obj.py(); + if let Ok(py_table) = obj.extract::() { + Ok(py_table) + } else if let Ok(py_table) = obj + .getattr("_inner") + .and_then(|inner| inner.extract::().map_err(Into::::into)) + { + Ok(py_table) + } else if let Ok(py_df) = obj.extract::() { + let provider = py_df.inner_df().as_ref().clone().into_view(); + Ok(PyTable::from(provider)) + } else if let Ok(py_df) = obj + .getattr("df") + .and_then(|inner| inner.extract::().map_err(Into::::into)) + { + let provider = py_df.inner_df().as_ref().clone().into_view(); + Ok(PyTable::from(provider)) + } else if let Some(provider) = { + let session = match session { + Some(session) => session, + None => PySessionContext::global_ctx()?.into_bound_py_any(obj.py())?, + }; + table_provider_from_pycapsule(obj.clone(), session)? + } { + Ok(PyTable::from(provider)) + } else { + let provider = Arc::new(Dataset::new(&obj, py)?) as Arc; + Ok(PyTable::from(provider)) + } + } + + /// Get a reference to the schema for this table + #[getter] + fn schema<'py>(&self, py: Python<'py>) -> PyResult> { + self.table.schema().to_pyarrow(py) + } + + /// Get the type of this table for metadata/catalog purposes. 
+ #[getter] + fn kind(&self) -> &str { + match self.table.table_type() { + TableType::Base => "physical", + TableType::View => "view", + TableType::Temporary => "temporary", + } + } + + fn __repr__(&self) -> PyResult { + let kind = self.kind(); + Ok(format!("Table(kind={kind})")) + } +} + +impl From> for PyTable { + fn from(table: Arc) -> Self { + Self { table } + } +} + +#[derive(Clone, Debug)] +pub(crate) struct TempViewTable { + df: Arc, +} + +/// This is nearly identical to `DataFrameTableProvider` +/// except that it is for temporary tables. +/// Remove when https://github.com/apache/datafusion/issues/18026 +/// closes. +impl TempViewTable { + pub(crate) fn new(df: Arc) -> Self { + Self { df } + } +} + +#[async_trait] +impl TableProvider for TempViewTable { + fn as_any(&self) -> &dyn Any { + self + } + + fn schema(&self) -> SchemaRef { + Arc::new(self.df.schema().as_arrow().clone()) + } + + fn table_type(&self) -> TableType { + TableType::Temporary + } + + async fn scan( + &self, + state: &dyn Session, + projection: Option<&Vec>, + filters: &[Expr], + limit: Option, + ) -> datafusion::common::Result> { + let filter = filters.iter().cloned().reduce(|acc, new| acc.and(new)); + let plan = self.df.logical_plan().clone(); + let mut plan = LogicalPlanBuilder::from(plan); + + if let Some(filter) = filter { + plan = plan.filter(filter)?; + } + + let mut plan = if let Some(projection) = projection { + // avoiding adding a redundant projection (e.g. SELECT * FROM view) + let current_projection = (0..plan.schema().fields().len()).collect::>(); + if projection == &current_projection { + plan + } else { + let fields: Vec = projection + .iter() + .map(|i| { + Expr::Column(Column::from( + self.df.logical_plan().schema().qualified_field(*i), + )) + }) + .collect(); + plan.project(fields)? 
+ } + } else { + plan + }; + + if let Some(limit) = limit { + plan = plan.limit(0, Some(limit))?; + } + + state.create_physical_plan(&plan.build()?).await + } + + fn supports_filters_pushdown( + &self, + filters: &[&Expr], + ) -> datafusion::common::Result> { + Ok(vec![TableProviderFilterPushDown::Exact; filters.len()]) + } +} + +#[derive(Debug)] +pub(crate) struct RustWrappedPyTableProviderFactory { + pub(crate) table_provider_factory: Py, + pub(crate) codec: Arc, +} + +impl RustWrappedPyTableProviderFactory { + pub fn new(table_provider_factory: Py, codec: Arc) -> Self { + Self { + table_provider_factory, + codec, + } + } + + fn create_inner( + &self, + cmd: CreateExternalTable, + codec: Bound, + ) -> PyResult> { + Python::attach(|py| { + let provider = self.table_provider_factory.bind(py); + let cmd = PyCreateExternalTable::from(cmd); + + provider + .call_method1("create", (cmd,)) + .and_then(|t| PyTable::new(t, Some(codec))) + .map(|t| t.table()) + }) + } +} + +#[async_trait] +impl TableProviderFactory for RustWrappedPyTableProviderFactory { + async fn create( + &self, + _: &dyn Session, + cmd: &CreateExternalTable, + ) -> datafusion::common::Result> { + Python::attach(|py| { + let codec = create_logical_extension_capsule(py, self.codec.as_ref()) + .map_err(errors::to_datafusion_err)?; + + self.create_inner(cmd.clone(), codec.into_any()) + .map_err(errors::to_datafusion_err) + }) + } +} diff --git a/src/udaf.rs b/crates/core/src/udaf.rs similarity index 64% rename from src/udaf.rs rename to crates/core/src/udaf.rs index eab4581df..ed26c79cc 100644 --- a/src/udaf.rs +++ b/crates/core/src/udaf.rs @@ -15,65 +15,67 @@ // specific language governing permissions and limitations // under the License. 
+use std::ptr::NonNull; use std::sync::Arc; -use pyo3::{prelude::*, types::PyTuple}; - -use crate::common::data_type::PyScalarValue; -use crate::errors::{py_datafusion_err, to_datafusion_err, PyDataFusionResult}; -use crate::expr::PyExpr; -use crate::utils::{parse_volatility, validate_pycapsule}; -use datafusion::arrow::array::{Array, ArrayRef}; +use datafusion::arrow::array::ArrayRef; use datafusion::arrow::datatypes::DataType; use datafusion::arrow::pyarrow::{PyArrowType, ToPyArrow}; use datafusion::common::ScalarValue; use datafusion::error::{DataFusionError, Result}; use datafusion::logical_expr::{ - create_udaf, Accumulator, AccumulatorFactoryFunction, AggregateUDF, + Accumulator, AccumulatorFactoryFunction, AggregateUDF, AggregateUDFImpl, create_udaf, }; -use datafusion_ffi::udaf::{FFI_AggregateUDF, ForeignAggregateUDF}; -use pyo3::types::PyCapsule; +use datafusion_ffi::udaf::FFI_AggregateUDF; +use datafusion_python_util::{parse_volatility, validate_pycapsule}; +use pyo3::ffi::c_str; +use pyo3::prelude::*; +use pyo3::types::{PyCapsule, PyTuple}; + +use crate::common::data_type::PyScalarValue; +use crate::errors::{PyDataFusionResult, py_datafusion_err, to_datafusion_err}; +use crate::expr::PyExpr; #[derive(Debug)] struct RustAccumulator { - accum: PyObject, + accum: Py, } impl RustAccumulator { - fn new(accum: PyObject) -> Self { + fn new(accum: Py) -> Self { Self { accum } } } impl Accumulator for RustAccumulator { fn state(&mut self) -> Result> { - Python::with_gil(|py| { - self.accum - .bind(py) - .call_method0("state")? - .extract::>() + Python::attach(|py| -> PyResult> { + let values = self.accum.bind(py).call_method0("state")?; + let mut scalars = Vec::new(); + for item in values.try_iter()? 
{ + let item: Bound<'_, PyAny> = item?; + let scalar = item.extract::()?.0; + scalars.push(scalar); + } + Ok(scalars) }) - .map(|v| v.into_iter().map(|x| x.0).collect()) .map_err(|e| DataFusionError::Execution(format!("{e}"))) } fn evaluate(&mut self) -> Result { - Python::with_gil(|py| { - self.accum - .bind(py) - .call_method0("evaluate")? - .extract::() + Python::attach(|py| -> PyResult { + let value = self.accum.bind(py).call_method0("evaluate")?; + value.extract::().map(|v| v.0) }) - .map(|v| v.0) .map_err(|e| DataFusionError::Execution(format!("{e}"))) } fn update_batch(&mut self, values: &[ArrayRef]) -> Result<()> { - Python::with_gil(|py| { + Python::attach(|py| { // 1. cast args to Pyarrow array let py_args = values .iter() - .map(|arg| arg.into_data().to_pyarrow(py).unwrap()) + .map(|arg| arg.to_data().to_pyarrow(py).unwrap()) .collect::>(); let py_args = PyTuple::new(py, py_args).map_err(to_datafusion_err)?; @@ -88,13 +90,13 @@ impl Accumulator for RustAccumulator { } fn merge_batch(&mut self, states: &[ArrayRef]) -> Result<()> { - Python::with_gil(|py| { + Python::attach(|py| { // // 1. cast states to Pyarrow arrays - let py_states: Result> = states + let py_states: Result>> = states .iter() .map(|state| { state - .into_data() + .to_data() .to_pyarrow(py) .map_err(|e| DataFusionError::Execution(format!("{e}"))) }) @@ -115,11 +117,11 @@ impl Accumulator for RustAccumulator { } fn retract_batch(&mut self, values: &[ArrayRef]) -> Result<()> { - Python::with_gil(|py| { + Python::attach(|py| { // 1. 
cast args to Pyarrow array let py_args = values .iter() - .map(|arg| arg.into_data().to_pyarrow(py).unwrap()) + .map(|arg| arg.to_data().to_pyarrow(py).unwrap()) .collect::>(); let py_args = PyTuple::new(py, py_args).map_err(to_datafusion_err)?; @@ -134,7 +136,7 @@ impl Accumulator for RustAccumulator { } fn supports_retract_batch(&self) -> bool { - Python::with_gil( + Python::attach( |py| match self.accum.bind(py).call_method0("supports_retract_batch") { Ok(x) => x.extract().unwrap_or(false), Err(_) => false, @@ -143,9 +145,9 @@ impl Accumulator for RustAccumulator { } } -pub fn to_rust_accumulator(accum: PyObject) -> AccumulatorFactoryFunction { - Arc::new(move |_| -> Result> { - let accum = Python::with_gil(|py| { +pub fn to_rust_accumulator(accum: Py) -> AccumulatorFactoryFunction { + Arc::new(move |_args| -> Result> { + let accum = Python::attach(|py| { accum .call0(py) .map_err(|e| DataFusionError::Execution(format!("{e}"))) @@ -154,8 +156,26 @@ pub fn to_rust_accumulator(accum: PyObject) -> AccumulatorFactoryFunction { }) } +fn aggregate_udf_from_capsule(capsule: &Bound<'_, PyCapsule>) -> PyDataFusionResult { + validate_pycapsule(capsule, "datafusion_aggregate_udf")?; + + let data: NonNull = capsule + .pointer_checked(Some(c_str!("datafusion_aggregate_udf")))? 
+ .cast(); + let udaf = unsafe { data.as_ref() }; + let udaf: Arc = udaf.into(); + + Ok(AggregateUDF::new_from_shared_impl(udaf)) +} + /// Represents an AggregateUDF -#[pyclass(frozen, name = "AggregateUDF", module = "datafusion", subclass)] +#[pyclass( + from_py_object, + frozen, + name = "AggregateUDF", + module = "datafusion", + subclass +)] #[derive(Debug, Clone)] pub struct PyAggregateUDF { pub(crate) function: AggregateUDF, @@ -167,7 +187,7 @@ impl PyAggregateUDF { #[pyo3(signature=(name, accumulator, input_type, return_type, state_type, volatility))] fn new( name: &str, - accumulator: PyObject, + accumulator: Py, input_type: PyArrowType>, return_type: PyArrowType, state_type: PyArrowType>, @@ -186,22 +206,22 @@ impl PyAggregateUDF { #[staticmethod] pub fn from_pycapsule(func: Bound<'_, PyAny>) -> PyDataFusionResult { + if func.is_instance_of::() { + let capsule = func.cast::().map_err(py_datafusion_err)?; + let function = aggregate_udf_from_capsule(capsule)?; + return Ok(Self { function }); + } + if func.hasattr("__datafusion_aggregate_udf__")? 
{ let capsule = func.getattr("__datafusion_aggregate_udf__")?.call0()?; - let capsule = capsule.downcast::().map_err(py_datafusion_err)?; - validate_pycapsule(capsule, "datafusion_aggregate_udf")?; - - let udaf = unsafe { capsule.reference::() }; - let udaf: ForeignAggregateUDF = udaf.try_into()?; - - Ok(Self { - function: udaf.into(), - }) - } else { - Err(crate::errors::PyDataFusionError::Common( - "__datafusion_aggregate_udf__ does not exist on AggregateUDF object.".to_string(), - )) + let capsule = capsule.cast::().map_err(py_datafusion_err)?; + let function = aggregate_udf_from_capsule(capsule)?; + return Ok(Self { function }); } + + Err(crate::errors::PyDataFusionError::Common( + "__datafusion_aggregate_udf__ does not exist on AggregateUDF object.".to_string(), + )) } /// creates a new PyExpr with the call of the udf diff --git a/crates/core/src/udf.rs b/crates/core/src/udf.rs new file mode 100644 index 000000000..7543f96d4 --- /dev/null +++ b/crates/core/src/udf.rs @@ -0,0 +1,226 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use std::any::Any; +use std::hash::{Hash, Hasher}; +use std::ptr::NonNull; +use std::sync::Arc; + +use arrow::datatypes::{Field, FieldRef}; +use arrow::pyarrow::ToPyArrow; +use datafusion::arrow::array::{ArrayData, make_array}; +use datafusion::arrow::datatypes::DataType; +use datafusion::arrow::pyarrow::{FromPyArrow, PyArrowType}; +use datafusion::common::internal_err; +use datafusion::error::DataFusionError; +use datafusion::logical_expr::{ + ColumnarValue, ReturnFieldArgs, ScalarFunctionArgs, ScalarUDF, ScalarUDFImpl, Signature, + Volatility, +}; +use datafusion_ffi::udf::FFI_ScalarUDF; +use datafusion_python_util::{parse_volatility, validate_pycapsule}; +use pyo3::ffi::c_str; +use pyo3::prelude::*; +use pyo3::types::{PyCapsule, PyTuple}; + +use crate::array::PyArrowArrayExportable; +use crate::errors::{PyDataFusionResult, py_datafusion_err, to_datafusion_err}; +use crate::expr::PyExpr; + +/// This struct holds the Python written function that is a +/// ScalarUDF. +#[derive(Debug)] +struct PythonFunctionScalarUDF { + name: String, + func: Py, + signature: Signature, + return_field: FieldRef, +} + +impl PythonFunctionScalarUDF { + fn new( + name: String, + func: Py, + input_fields: Vec, + return_field: Field, + volatility: Volatility, + ) -> Self { + let input_types = input_fields.iter().map(|f| f.data_type().clone()).collect(); + let signature = Signature::exact(input_types, volatility); + Self { + name, + func, + signature, + return_field: Arc::new(return_field), + } + } +} + +impl Eq for PythonFunctionScalarUDF {} +impl PartialEq for PythonFunctionScalarUDF { + fn eq(&self, other: &Self) -> bool { + self.name == other.name + && self.signature == other.signature + && self.return_field == other.return_field + && Python::attach(|py| self.func.bind(py).eq(other.func.bind(py)).unwrap_or(false)) + } +} + +impl Hash for PythonFunctionScalarUDF { + fn hash(&self, state: &mut H) { + self.name.hash(state); + self.signature.hash(state); + 
self.return_field.hash(state); + + Python::attach(|py| { + let py_hash = self.func.bind(py).hash().unwrap_or(0); // Handle unhashable objects + + state.write_isize(py_hash); + }); + } +} + +impl ScalarUDFImpl for PythonFunctionScalarUDF { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + &self.name + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> datafusion::common::Result { + internal_err!( + "return_field should not be called when return_field_from_args is implemented." + ) + } + + fn return_field_from_args( + &self, + _args: ReturnFieldArgs, + ) -> datafusion::common::Result { + Ok(Arc::clone(&self.return_field)) + } + + fn invoke_with_args( + &self, + args: ScalarFunctionArgs, + ) -> datafusion::common::Result { + let num_rows = args.number_rows; + Python::attach(|py| { + // 1. cast args to Pyarrow arrays + let py_args = args + .args + .into_iter() + .zip(args.arg_fields) + .map(|(arg, field)| { + let array = arg.to_array(num_rows)?; + PyArrowArrayExportable::new(array, field) + .to_pyarrow(py) + .map_err(to_datafusion_err) + }) + .collect::, _>>()?; + let py_args = PyTuple::new(py, py_args).map_err(to_datafusion_err)?; + + // 2. call function + let value = self + .func + .call(py, py_args, None) + .map_err(|e| DataFusionError::Execution(format!("{e:?}")))?; + + // 3. 
cast to arrow::array::Array + let array_data = ArrayData::from_pyarrow_bound(value.bind(py)) + .map_err(|e| DataFusionError::Execution(format!("{e:?}")))?; + Ok(ColumnarValue::Array(make_array(array_data))) + }) + } +} + +/// Represents a PyScalarUDF +#[pyclass( + from_py_object, + frozen, + name = "ScalarUDF", + module = "datafusion", + subclass +)] +#[derive(Debug, Clone)] +pub struct PyScalarUDF { + pub(crate) function: ScalarUDF, +} + +#[pymethods] +impl PyScalarUDF { + #[new] + #[pyo3(signature=(name, func, input_types, return_type, volatility))] + fn new( + name: String, + func: Py, + input_types: PyArrowType>, + return_type: PyArrowType, + volatility: &str, + ) -> PyResult { + let py_function = PythonFunctionScalarUDF::new( + name, + func, + input_types.0, + return_type.0, + parse_volatility(volatility)?, + ); + let function = ScalarUDF::new_from_impl(py_function); + + Ok(Self { function }) + } + + #[staticmethod] + pub fn from_pycapsule(func: Bound<'_, PyAny>) -> PyDataFusionResult { + if func.hasattr("__datafusion_scalar_udf__")? { + let capsule = func.getattr("__datafusion_scalar_udf__")?.call0()?; + let capsule = capsule.cast::().map_err(py_datafusion_err)?; + validate_pycapsule(capsule, "datafusion_scalar_udf")?; + + let data: NonNull = capsule + .pointer_checked(Some(c_str!("datafusion_scalar_udf")))? 
+ .cast(); + let udf = unsafe { data.as_ref() }; + let udf: Arc = udf.into(); + + Ok(Self { + function: ScalarUDF::new_from_shared_impl(udf), + }) + } else { + Err(crate::errors::PyDataFusionError::Common( + "__datafusion_scalar_udf__ does not exist on ScalarUDF object.".to_string(), + )) + } + } + + /// creates a new PyExpr with the call of the udf + #[pyo3(signature = (*args))] + fn __call__(&self, args: Vec) -> PyResult { + let args = args.iter().map(|e| e.expr.clone()).collect(); + Ok(self.function.call(args).into()) + } + + fn __repr__(&self) -> PyResult { + Ok(format!("ScalarUDF({})", self.function.name())) + } +} diff --git a/src/udtf.rs b/crates/core/src/udtf.rs similarity index 62% rename from src/udtf.rs rename to crates/core/src/udtf.rs index f6604e5bc..77c5ffbbc 100644 --- a/src/udtf.rs +++ b/crates/core/src/udtf.rs @@ -15,21 +15,27 @@ // specific language governing permissions and limitations // under the License. -use pyo3::prelude::*; +use std::ptr::NonNull; use std::sync::Arc; -use crate::errors::{py_datafusion_err, to_datafusion_err}; -use crate::expr::PyExpr; -use crate::table::PyTable; -use crate::utils::validate_pycapsule; use datafusion::catalog::{TableFunctionImpl, TableProvider}; use datafusion::error::Result as DataFusionResult; use datafusion::logical_expr::Expr; -use datafusion_ffi::udtf::{FFI_TableFunction, ForeignTableFunction}; -use pyo3::types::{PyCapsule, PyTuple}; +use datafusion_ffi::udtf::FFI_TableFunction; +use datafusion_python_util::validate_pycapsule; +use pyo3::IntoPyObjectExt; +use pyo3::exceptions::{PyImportError, PyTypeError}; +use pyo3::ffi::c_str; +use pyo3::prelude::*; +use pyo3::types::{PyCapsule, PyTuple, PyType}; + +use crate::context::PySessionContext; +use crate::errors::{py_datafusion_err, to_datafusion_err}; +use crate::expr::PyExpr; +use crate::table::PyTable; /// Represents a user defined table function -#[pyclass(frozen, name = "TableFunction", module = "datafusion")] +#[pyclass(from_py_object, frozen, name = 
"TableFunction", module = "datafusion")] #[derive(Debug, Clone)] pub struct PyTableFunction { pub(crate) name: String, @@ -39,24 +45,44 @@ pub struct PyTableFunction { // TODO: Implement pure python based user defined table functions #[derive(Debug, Clone)] pub(crate) enum PyTableFunctionInner { - PythonFunction(Arc), + PythonFunction(Arc>), FFIFunction(Arc), } #[pymethods] impl PyTableFunction { #[new] - #[pyo3(signature=(name, func))] - pub fn new(name: &str, func: Bound<'_, PyAny>) -> PyResult { + #[pyo3(signature=(name, func, session))] + pub fn new( + name: &str, + func: Bound<'_, PyAny>, + session: Option>, + ) -> PyResult { let inner = if func.hasattr("__datafusion_table_function__")? { - let capsule = func.getattr("__datafusion_table_function__")?.call0()?; - let capsule = capsule.downcast::().map_err(py_datafusion_err)?; + let py = func.py(); + let session = match session { + Some(session) => session, + None => PySessionContext::global_ctx()?.into_bound_py_any(py)?, + }; + let capsule = func + .getattr("__datafusion_table_function__")? + .call1((session,)).map_err(|err| { + if err.get_type(py).is(PyType::new::(py)) { + PyImportError::new_err("Incompatible libraries. DataFusion 52.0.0 introduced an incompatible signature change for table functions. Either downgrade DataFusion or upgrade your function library.") + } else { + err + } + })?; + let capsule = capsule.cast::().map_err(py_datafusion_err)?; validate_pycapsule(capsule, "datafusion_table_function")?; - let ffi_func = unsafe { capsule.reference::() }; - let foreign_func: ForeignTableFunction = ffi_func.to_owned().into(); + let data: NonNull = capsule + .pointer_checked(Some(c_str!("datafusion_table_function")))? 
+ .cast(); + let ffi_func = unsafe { data.as_ref() }; + let foreign_func: Arc = ffi_func.to_owned().into(); - PyTableFunctionInner::FFIFunction(Arc::new(foreign_func)) + PyTableFunctionInner::FFIFunction(foreign_func) } else { let py_obj = Arc::new(func.unbind()); PyTableFunctionInner::PythonFunction(py_obj) @@ -83,7 +109,7 @@ impl PyTableFunction { #[allow(clippy::result_large_err)] fn call_python_table_function( - func: &Arc, + func: &Arc>, args: &[Expr], ) -> DataFusionResult> { let args = args @@ -92,12 +118,12 @@ fn call_python_table_function( .collect::>(); // move |args: &[ArrayRef]| -> Result { - Python::with_gil(|py| { + Python::attach(|py| { let py_args = PyTuple::new(py, args)?; let provider_obj = func.call1(py, py_args)?; - let provider = provider_obj.bind(py); + let provider = provider_obj.bind(py).clone(); - Ok::, PyErr>(PyTable::new(provider)?.table) + Ok::, PyErr>(PyTable::new(provider, None)?.table) }) .map_err(to_datafusion_err) } diff --git a/src/udwf.rs b/crates/core/src/udwf.rs similarity index 85% rename from src/udwf.rs rename to crates/core/src/udwf.rs index ceeaa0ef1..ff7ab0352 100644 --- a/src/udwf.rs +++ b/crates/core/src/udwf.rs @@ -17,48 +17,50 @@ use std::any::Any; use std::ops::Range; +use std::ptr::NonNull; use std::sync::Arc; -use arrow::array::{make_array, Array, ArrayData, ArrayRef}; -use datafusion::logical_expr::function::{PartitionEvaluatorArgs, WindowUDFFieldArgs}; -use datafusion::logical_expr::window_state::WindowAggState; -use datafusion::scalar::ScalarValue; -use pyo3::exceptions::PyValueError; -use pyo3::prelude::*; - -use crate::common::data_type::PyScalarValue; -use crate::errors::{py_datafusion_err, to_datafusion_err, PyDataFusionResult}; -use crate::expr::PyExpr; -use crate::utils::{parse_volatility, validate_pycapsule}; +use arrow::array::{Array, ArrayData, ArrayRef, make_array}; use datafusion::arrow::datatypes::DataType; use datafusion::arrow::pyarrow::{FromPyArrow, PyArrowType, ToPyArrow}; use 
datafusion::error::{DataFusionError, Result}; +use datafusion::logical_expr::function::{PartitionEvaluatorArgs, WindowUDFFieldArgs}; use datafusion::logical_expr::ptr_eq::PtrEq; +use datafusion::logical_expr::window_state::WindowAggState; use datafusion::logical_expr::{ PartitionEvaluator, PartitionEvaluatorFactory, Signature, Volatility, WindowUDF, WindowUDFImpl, }; -use datafusion_ffi::udwf::{FFI_WindowUDF, ForeignWindowUDF}; +use datafusion::scalar::ScalarValue; +use datafusion_ffi::udwf::FFI_WindowUDF; +use datafusion_python_util::{parse_volatility, validate_pycapsule}; +use pyo3::exceptions::PyValueError; +use pyo3::ffi::c_str; +use pyo3::prelude::*; use pyo3::types::{PyCapsule, PyList, PyTuple}; +use crate::common::data_type::PyScalarValue; +use crate::errors::{PyDataFusionResult, py_datafusion_err, to_datafusion_err}; +use crate::expr::PyExpr; + #[derive(Debug)] struct RustPartitionEvaluator { - evaluator: PyObject, + evaluator: Py, } impl RustPartitionEvaluator { - fn new(evaluator: PyObject) -> Self { + fn new(evaluator: Py) -> Self { Self { evaluator } } } impl PartitionEvaluator for RustPartitionEvaluator { fn memoize(&mut self, _state: &mut WindowAggState) -> Result<()> { - Python::with_gil(|py| self.evaluator.bind(py).call_method0("memoize").map(|_| ())) + Python::attach(|py| self.evaluator.bind(py).call_method0("memoize").map(|_| ())) .map_err(|e| DataFusionError::Execution(format!("{e}"))) } fn get_range(&self, idx: usize, n_rows: usize) -> Result> { - Python::with_gil(|py| { + Python::attach(|py| { let py_args = vec![idx.into_pyobject(py)?, n_rows.into_pyobject(py)?]; let py_args = PyTuple::new(py, py_args)?; @@ -84,7 +86,7 @@ impl PartitionEvaluator for RustPartitionEvaluator { } fn is_causal(&self) -> bool { - Python::with_gil(|py| { + Python::attach(|py| { self.evaluator .bind(py) .call_method0("is_causal") @@ -94,8 +96,7 @@ impl PartitionEvaluator for RustPartitionEvaluator { } fn evaluate_all(&mut self, values: &[ArrayRef], num_rows: usize) -> 
Result { - println!("evaluate all called with number of values {}", values.len()); - Python::with_gil(|py| { + Python::attach(|py| { let py_values = PyList::new( py, values @@ -117,7 +118,7 @@ impl PartitionEvaluator for RustPartitionEvaluator { } fn evaluate(&mut self, values: &[ArrayRef], range: &Range) -> Result { - Python::with_gil(|py| { + Python::attach(|py| { let py_values = PyList::new( py, values @@ -141,7 +142,7 @@ impl PartitionEvaluator for RustPartitionEvaluator { num_rows: usize, ranks_in_partition: &[Range], ) -> Result { - Python::with_gil(|py| { + Python::attach(|py| { let ranks = ranks_in_partition .iter() .map(|r| PyTuple::new(py, vec![r.start, r.end])) @@ -168,7 +169,7 @@ impl PartitionEvaluator for RustPartitionEvaluator { } fn supports_bounded_execution(&self) -> bool { - Python::with_gil(|py| { + Python::attach(|py| { self.evaluator .bind(py) .call_method0("supports_bounded_execution") @@ -178,7 +179,7 @@ impl PartitionEvaluator for RustPartitionEvaluator { } fn uses_window_frame(&self) -> bool { - Python::with_gil(|py| { + Python::attach(|py| { self.evaluator .bind(py) .call_method0("uses_window_frame") @@ -188,7 +189,7 @@ impl PartitionEvaluator for RustPartitionEvaluator { } fn include_rank(&self) -> bool { - Python::with_gil(|py| { + Python::attach(|py| { self.evaluator .bind(py) .call_method0("include_rank") @@ -198,9 +199,9 @@ impl PartitionEvaluator for RustPartitionEvaluator { } } -pub fn to_rust_partition_evaluator(evaluator: PyObject) -> PartitionEvaluatorFactory { +pub fn to_rust_partition_evaluator(evaluator: Py) -> PartitionEvaluatorFactory { Arc::new(move || -> Result> { - let evaluator = Python::with_gil(|py| { + let evaluator = Python::attach(|py| { evaluator .call0(py) .map_err(|e| DataFusionError::Execution(e.to_string())) @@ -210,7 +211,13 @@ pub fn to_rust_partition_evaluator(evaluator: PyObject) -> PartitionEvaluatorFac } /// Represents an WindowUDF -#[pyclass(frozen, name = "WindowUDF", module = "datafusion", subclass)] 
+#[pyclass( + from_py_object, + frozen, + name = "WindowUDF", + module = "datafusion", + subclass +)] #[derive(Debug, Clone)] pub struct PyWindowUDF { pub(crate) function: WindowUDF, @@ -222,7 +229,7 @@ impl PyWindowUDF { #[pyo3(signature=(name, evaluator, input_types, return_type, volatility))] fn new( name: &str, - evaluator: PyObject, + evaluator: Py, input_types: Vec>, return_type: PyArrowType, volatility: &str, @@ -249,22 +256,24 @@ impl PyWindowUDF { #[staticmethod] pub fn from_pycapsule(func: Bound<'_, PyAny>) -> PyDataFusionResult { - if func.hasattr("__datafusion_window_udf__")? { - let capsule = func.getattr("__datafusion_window_udf__")?.call0()?; - let capsule = capsule.downcast::().map_err(py_datafusion_err)?; - validate_pycapsule(capsule, "datafusion_window_udf")?; + let capsule = if func.hasattr("__datafusion_window_udf__")? { + func.getattr("__datafusion_window_udf__")?.call0()? + } else { + func + }; - let udwf = unsafe { capsule.reference::() }; - let udwf: ForeignWindowUDF = udwf.try_into()?; + let capsule = capsule.cast::().map_err(py_datafusion_err)?; + validate_pycapsule(capsule, "datafusion_window_udf")?; - Ok(Self { - function: udwf.into(), - }) - } else { - Err(crate::errors::PyDataFusionError::Common( - "__datafusion_window_udf__ does not exist on WindowUDF object.".to_string(), - )) - } + let data: NonNull = capsule + .pointer_checked(Some(c_str!("datafusion_window_udf")))? 
+ .cast(); + let udwf = unsafe { data.as_ref() }; + let udwf: Arc = udwf.into(); + + Ok(Self { + function: WindowUDF::new_from_shared_impl(udwf), + }) } fn __repr__(&self) -> PyResult { diff --git a/src/unparser/dialect.rs b/crates/core/src/unparser/dialect.rs similarity index 93% rename from src/unparser/dialect.rs rename to crates/core/src/unparser/dialect.rs index 5df0a0c2e..52a2da00b 100644 --- a/src/unparser/dialect.rs +++ b/crates/core/src/unparser/dialect.rs @@ -22,7 +22,13 @@ use datafusion::sql::unparser::dialect::{ }; use pyo3::prelude::*; -#[pyclass(frozen, name = "Dialect", module = "datafusion.unparser", subclass)] +#[pyclass( + from_py_object, + frozen, + name = "Dialect", + module = "datafusion.unparser", + subclass +)] #[derive(Clone)] pub struct PyDialect { pub dialect: Arc, diff --git a/src/unparser/mod.rs b/crates/core/src/unparser/mod.rs similarity index 87% rename from src/unparser/mod.rs rename to crates/core/src/unparser/mod.rs index f234345a7..5142b918e 100644 --- a/src/unparser/mod.rs +++ b/crates/core/src/unparser/mod.rs @@ -19,13 +19,21 @@ mod dialect; use std::sync::Arc; -use datafusion::sql::unparser::{dialect::Dialect, Unparser}; +use datafusion::sql::unparser::Unparser; +use datafusion::sql::unparser::dialect::Dialect; use dialect::PyDialect; -use pyo3::{exceptions::PyValueError, prelude::*}; +use pyo3::exceptions::PyValueError; +use pyo3::prelude::*; use crate::sql::logical::PyLogicalPlan; -#[pyclass(frozen, name = "Unparser", module = "datafusion.unparser", subclass)] +#[pyclass( + from_py_object, + frozen, + name = "Unparser", + module = "datafusion.unparser", + subclass +)] #[derive(Clone)] pub struct PyUnparser { dialect: Arc, diff --git a/crates/util/Cargo.toml b/crates/util/Cargo.toml new file mode 100644 index 000000000..00d5946a5 --- /dev/null +++ b/crates/util/Cargo.toml @@ -0,0 +1,34 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +[package] +name = "datafusion-python-util" +version.workspace = true +edition.workspace = true +rust-version.workspace = true +license.workspace = true +description.workspace = true +homepage.workspace = true +repository.workspace = true + +[dependencies] +tokio = { workspace = true, features = ["macros", "rt", "rt-multi-thread"] } +pyo3 = { workspace = true } +datafusion = { workspace = true } +datafusion-ffi = { workspace = true } +arrow = { workspace = true } +prost = { workspace = true } diff --git a/src/errors.rs b/crates/util/src/errors.rs similarity index 87% rename from src/errors.rs rename to crates/util/src/errors.rs index d4f4f221d..0d25c8847 100644 --- a/src/errors.rs +++ b/crates/util/src/errors.rs @@ -22,7 +22,8 @@ use std::fmt::Debug; use datafusion::arrow::error::ArrowError; use datafusion::error::DataFusionError as InnerDataFusionError; use prost::EncodeError; -use pyo3::{exceptions::PyException, PyErr}; +use pyo3::PyErr; +use pyo3::exceptions::{PyException, PyValueError}; pub type PyDataFusionResult = std::result::Result; @@ -38,7 +39,7 @@ pub enum PyDataFusionError { impl fmt::Display for PyDataFusionError { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match self { - PyDataFusionError::ExecutionError(e) => write!(f, "DataFusion error: {e:?}"), + 
PyDataFusionError::ExecutionError(e) => write!(f, "DataFusion error: {e}"), PyDataFusionError::ArrowError(e) => write!(f, "Arrow error: {e:?}"), PyDataFusionError::PythonError(e) => write!(f, "Python error {e:?}"), PyDataFusionError::Common(e) => write!(f, "{e}"), @@ -95,3 +96,13 @@ pub fn py_unsupported_variant_err(e: impl Debug) -> PyErr { pub fn to_datafusion_err(e: impl Debug) -> InnerDataFusionError { InnerDataFusionError::Execution(format!("{e:?}")) } + +pub fn from_datafusion_error(err: InnerDataFusionError) -> PyErr { + match err { + InnerDataFusionError::External(boxed) => match boxed.downcast::() { + Ok(py_err) => *py_err, + Err(original_boxed) => PyValueError::new_err(format!("{original_boxed}")), + }, + _ => PyValueError::new_err(format!("{err}")), + } +} diff --git a/crates/util/src/lib.rs b/crates/util/src/lib.rs new file mode 100644 index 000000000..2678a6b9a --- /dev/null +++ b/crates/util/src/lib.rs @@ -0,0 +1,231 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use std::future::Future; +use std::ptr::NonNull; +use std::sync::{Arc, OnceLock}; +use std::time::Duration; + +use datafusion::datasource::TableProvider; +use datafusion::execution::context::SessionContext; +use datafusion::logical_expr::Volatility; +use datafusion_ffi::proto::logical_extension_codec::FFI_LogicalExtensionCodec; +use datafusion_ffi::table_provider::FFI_TableProvider; +use pyo3::exceptions::{PyImportError, PyTypeError, PyValueError}; +use pyo3::ffi::c_str; +use pyo3::prelude::*; +use pyo3::types::{PyCapsule, PyType}; +use tokio::runtime::Runtime; +use tokio::task::JoinHandle; +use tokio::time::sleep; + +use crate::errors::{PyDataFusionError, PyDataFusionResult, py_datafusion_err, to_datafusion_err}; + +pub mod errors; + +/// Utility to get the Tokio Runtime from Python +#[inline] +pub fn get_tokio_runtime() -> &'static Runtime { + // NOTE: Other pyo3 python libraries have had issues with using tokio + // behind a forking app-server like `gunicorn` + // If we run into that problem, in the future we can look to `delta-rs` + // which adds a check that disallows calls from a forked process + // https://github.com/delta-io/delta-rs/blob/87010461cfe01563d91a4b9cd6fa468e2ad5f283/python/src/utils.rs#L10-L31 + static RUNTIME: OnceLock = OnceLock::new(); + RUNTIME.get_or_init(|| Runtime::new().unwrap()) +} + +#[inline] +pub fn is_ipython_env(py: Python) -> &'static bool { + static IS_IPYTHON_ENV: OnceLock = OnceLock::new(); + IS_IPYTHON_ENV.get_or_init(|| { + py.import("IPython") + .and_then(|ipython| ipython.call_method0("get_ipython")) + .map(|ipython| !ipython.is_none()) + .unwrap_or(false) + }) +} + +/// Utility to get the Global DataFusion CTX +#[inline] +pub fn get_global_ctx() -> &'static Arc { + static CTX: OnceLock> = OnceLock::new(); + CTX.get_or_init(|| Arc::new(SessionContext::new())) +} + +/// Utility to collect rust futures with GIL released and respond to +/// Python interrupts such as ``KeyboardInterrupt``.
If a signal is +/// received while the future is running, the future is aborted and the +/// corresponding Python exception is raised. +pub fn wait_for_future(py: Python, fut: F) -> PyResult +where + F: Future + Send, + F::Output: Send, +{ + let runtime: &Runtime = get_tokio_runtime(); + const INTERVAL_CHECK_SIGNALS: Duration = Duration::from_millis(1_000); + + // Some fast running processes that generate many `wait_for_future` calls like + // PartitionedDataFrameStreamReader::next require checking for interrupts early + py.run(cr"pass", None, None)?; + py.check_signals()?; + + py.detach(|| { + runtime.block_on(async { + tokio::pin!(fut); + loop { + tokio::select! { + res = &mut fut => break Ok(res), + _ = sleep(INTERVAL_CHECK_SIGNALS) => { + Python::attach(|py| { + // Execute a no-op Python statement to trigger signal processing. + // This is necessary because py.check_signals() alone doesn't + // actually check for signals - it only raises an exception if + // a signal was already set during a previous Python API call. + // Running even trivial Python code forces the interpreter to + // process any pending signals (like KeyboardInterrupt). + py.run(cr"pass", None, None)?; + py.check_signals() + })?; + } + } + } + }) + }) +} + +/// Spawn a [`Future`] on the Tokio runtime and wait for completion +/// while respecting Python signal handling. +pub fn spawn_future(py: Python, fut: F) -> PyDataFusionResult +where + F: Future> + Send + 'static, + T: Send + 'static, +{ + let rt = get_tokio_runtime(); + let handle: JoinHandle> = rt.spawn(fut); + // Wait for the join handle while respecting Python signal handling. 
+ // We handle errors in two steps so `?` maps the error types correctly: + // 1) convert any Python-related error from `wait_for_future` into `PyDataFusionError` + // 2) convert any DataFusion error (inner result) into `PyDataFusionError` + let inner_result = wait_for_future(py, async { + // handle.await yields `Result, JoinError>` + // map JoinError into a DataFusion error so the async block returns + // `datafusion::common::Result` (i.e. Result) + match handle.await { + Ok(inner) => inner, + Err(join_err) => Err(to_datafusion_err(join_err)), + } + })?; // converts PyErr -> PyDataFusionError + + // `inner_result` is `datafusion::common::Result`; use `?` to convert + // the inner DataFusion error into `PyDataFusionError` via `From` and + // return the inner `T` on success. + Ok(inner_result?) +} + +pub fn parse_volatility(value: &str) -> PyDataFusionResult { + Ok(match value { + "immutable" => Volatility::Immutable, + "stable" => Volatility::Stable, + "volatile" => Volatility::Volatile, + value => { + return Err(PyDataFusionError::Common(format!( + "Unsupported volatility type: `{value}`, supported \ + values are: immutable, stable and volatile." + ))); + } + }) +} + +pub fn validate_pycapsule(capsule: &Bound, name: &str) -> PyResult<()> { + let capsule_name = capsule.name()?; + if capsule_name.is_none() { + return Err(PyValueError::new_err(format!( + "Expected {name} PyCapsule to have name set." + ))); + } + + let capsule_name = unsafe { capsule_name.unwrap().as_cstr().to_str()? }; + if capsule_name != name { + return Err(PyValueError::new_err(format!( + "Expected name '{name}' in PyCapsule, instead got '{capsule_name}'" + ))); + } + + Ok(()) +} + +pub fn table_provider_from_pycapsule<'py>( + mut obj: Bound<'py, PyAny>, + session: Bound<'py, PyAny>, +) -> PyResult>> { + if obj.hasattr("__datafusion_table_provider__")? { + obj = obj + .getattr("__datafusion_table_provider__")? 
+ .call1((session,)).map_err(|err| { + let py = obj.py(); + if err.get_type(py).is(PyType::new::(py)) { + PyImportError::new_err("Incompatible libraries. DataFusion 52.0.0 introduced an incompatible signature change for table providers. Either downgrade DataFusion or upgrade your function library.") + } else { + err + } + })?; + } + + if let Ok(capsule) = obj.cast::().map_err(py_datafusion_err) { + validate_pycapsule(capsule, "datafusion_table_provider")?; + + let data: NonNull = capsule + .pointer_checked(Some(c_str!("datafusion_table_provider")))? + .cast(); + let provider = unsafe { data.as_ref() }; + let provider: Arc = provider.into(); + + Ok(Some(provider)) + } else { + Ok(None) + } +} + +pub fn create_logical_extension_capsule<'py>( + py: Python<'py>, + codec: &FFI_LogicalExtensionCodec, +) -> PyResult> { + let name = cr"datafusion_logical_extension_codec".into(); + let codec = codec.clone(); + + PyCapsule::new(py, codec, Some(name)) +} + +pub fn ffi_logical_codec_from_pycapsule(obj: Bound) -> PyResult { + let attr_name = "__datafusion_logical_extension_codec__"; + let capsule = if obj.hasattr(attr_name)? { + obj.getattr(attr_name)?.call0()? + } else { + obj + }; + + let capsule = capsule.cast::()?; + validate_pycapsule(capsule, "datafusion_logical_extension_codec")?; + + let data: NonNull = capsule + .pointer_checked(Some(c_str!("datafusion_logical_extension_codec")))? + .cast(); + let codec = unsafe { data.as_ref() }; + + Ok(codec.clone()) +} diff --git a/dev/changelog/50.1.0.md b/dev/changelog/50.1.0.md new file mode 100644 index 000000000..3b9ff84ff --- /dev/null +++ b/dev/changelog/50.1.0.md @@ -0,0 +1,57 @@ + + +# Apache DataFusion Python 50.1.0 Changelog + +This release consists of 11 commits from 7 contributors. See credits at the end of this changelog for more information. 
+ +**Breaking changes:** + +- Unify Table representations [#1256](https://github.com/apache/datafusion-python/pull/1256) (timsaucer) + +**Implemented enhancements:** + +- feat: expose DataFrame.write_table [#1264](https://github.com/apache/datafusion-python/pull/1264) (timsaucer) +- feat: expose `DataFrame.parse_sql_expr` [#1274](https://github.com/apache/datafusion-python/pull/1274) (milenkovicm) + +**Other:** + +- Update version number, add changelog [#1249](https://github.com/apache/datafusion-python/pull/1249) (timsaucer) +- Fix drop() method to handle quoted column names consistently [#1242](https://github.com/apache/datafusion-python/pull/1242) (H0TB0X420) +- Make Session Context `pyclass` frozen so interior mutability is only managed by rust [#1248](https://github.com/apache/datafusion-python/pull/1248) (ntjohnson1) +- macos-13 is deprecated [#1259](https://github.com/apache/datafusion-python/pull/1259) (kevinjqliu) +- Freeze PyO3 wrappers & introduce interior mutability to avoid PyO3 borrow errors [#1253](https://github.com/apache/datafusion-python/pull/1253) (kosiew) +- chore: update dependencies [#1269](https://github.com/apache/datafusion-python/pull/1269) (timsaucer) + +## Credits + +Thank you to everyone who contributed to this release. Here is a breakdown of commits (PRs merged) per contributor. + +``` + 4 Tim Saucer + 2 Siew Kam Onn + 1 H0TB0X420 + 1 Kevin Liu + 1 Marko Milenković + 1 Nick + 1 kosiew +``` + +Thank you also to everyone who contributed in other ways such as filing issues, reviewing PRs, and providing feedback on this release. + diff --git a/dev/changelog/51.0.0.md b/dev/changelog/51.0.0.md new file mode 100644 index 000000000..cc157eb0d --- /dev/null +++ b/dev/changelog/51.0.0.md @@ -0,0 +1,74 @@ + + +# Apache DataFusion Python 51.0.0 Changelog + +This release consists of 23 commits from 7 contributors. See credits at the end of this changelog for more information.
+ +**Breaking changes:** + +- feat: reduce duplicate fields on join [#1184](https://github.com/apache/datafusion-python/pull/1184) (timsaucer) + +**Implemented enhancements:** + +- feat: expose `select_exprs` method on DataFrame [#1271](https://github.com/apache/datafusion-python/pull/1271) (milenkovicm) +- feat: allow DataFrame.filter to accept SQL strings [#1276](https://github.com/apache/datafusion-python/pull/1276) (K-dash) +- feat: add temporary view option for into_view [#1267](https://github.com/apache/datafusion-python/pull/1267) (timsaucer) +- feat: support session token parameter for AmazonS3 [#1275](https://github.com/apache/datafusion-python/pull/1275) (GCHQDeveloper028) +- feat: `with_column` supports SQL expression [#1284](https://github.com/apache/datafusion-python/pull/1284) (milenkovicm) +- feat: Add SQL expression for `repartition_by_hash` [#1285](https://github.com/apache/datafusion-python/pull/1285) (milenkovicm) +- feat: Add SQL expression support for `with_columns` [#1286](https://github.com/apache/datafusion-python/pull/1286) (milenkovicm) + +**Fixed bugs:** + +- fix: use coalesce instead of drop_duplicate_keys for join [#1318](https://github.com/apache/datafusion-python/pull/1318) (mesejo) +- fix: Inconsistent schemas when converting to pyarrow [#1315](https://github.com/apache/datafusion-python/pull/1315) (nuno-faria) + +**Other:** + +- Release 50.1 [#1281](https://github.com/apache/datafusion-python/pull/1281) (timsaucer) +- Update python minimum version to 3.10 [#1296](https://github.com/apache/datafusion-python/pull/1296) (timsaucer) +- chore: update datafusion minor version [#1297](https://github.com/apache/datafusion-python/pull/1297) (timsaucer) +- Enable remaining pylints [#1298](https://github.com/apache/datafusion-python/pull/1298) (timsaucer) +- Add Arrow C streaming, DataFrame iteration, and OOM-safe streaming execution [#1222](https://github.com/apache/datafusion-python/pull/1222) (kosiew) +- Add PyCapsule Type Support and Type 
Hint Enhancements for AggregateUDF in DataFusion Python Bindings [#1277](https://github.com/apache/datafusion-python/pull/1277) (kosiew) +- Add collect_column to dataframe [#1302](https://github.com/apache/datafusion-python/pull/1302) (timsaucer) +- chore: apply cargo fmt with import organization [#1303](https://github.com/apache/datafusion-python/pull/1303) (timsaucer) +- Feat/parameterized sql queries [#964](https://github.com/apache/datafusion-python/pull/964) (timsaucer) +- Upgrade to Datafusion 51 [#1311](https://github.com/apache/datafusion-python/pull/1311) (nuno-faria) +- minor: resolve build errors after latest merge into main [#1325](https://github.com/apache/datafusion-python/pull/1325) (timsaucer) +- Update build workflow link [#1330](https://github.com/apache/datafusion-python/pull/1330) (timsaucer) +- Do not convert pyarrow scalar values to plain python types when passing as `lit` [#1319](https://github.com/apache/datafusion-python/pull/1319) (timsaucer) + +## Credits + +Thank you to everyone who contributed to this release. Here is a breakdown of commits (PRs merged) per contributor. + +``` + 12 Tim Saucer + 4 Marko Milenković + 2 Nuno Faria + 2 kosiew + 1 Daniel Mesejo + 1 GCHQDeveloper028 + 1 𝕂 +``` + +Thank you also to everyone who contributed in other ways such as filing issues, reviewing PRs, and providing feedback on this release. + diff --git a/dev/changelog/52.0.0.md b/dev/changelog/52.0.0.md new file mode 100644 index 000000000..3f848bb47 --- /dev/null +++ b/dev/changelog/52.0.0.md @@ -0,0 +1,78 @@ + + +# Apache DataFusion Python 52.0.0 Changelog + +This release consists of 26 commits from 9 contributors. See credits at the end of this changelog for more information. 
+ +**Implemented enhancements:** + +- feat: add CatalogProviderList support [#1363](https://github.com/apache/datafusion-python/pull/1363) (timsaucer) +- feat: add support for generating JSON formatted substrait plan [#1376](https://github.com/apache/datafusion-python/pull/1376) (Prathamesh9284) +- feat: add regexp_instr function [#1382](https://github.com/apache/datafusion-python/pull/1382) (mesejo) + +**Fixed bugs:** + +- fix: mangled errors [#1377](https://github.com/apache/datafusion-python/pull/1377) (mesejo) + +**Documentation updates:** + +- docs: Clarify first_value usage in select vs aggregate [#1348](https://github.com/apache/datafusion-python/pull/1348) (AdMub) + +**Other:** + +- Release 51.0.0 [#1333](https://github.com/apache/datafusion-python/pull/1333) (timsaucer) +- Use explicit timer in unit test [#1338](https://github.com/apache/datafusion-python/pull/1338) (timsaucer) +- Add use_fabric_endpoint parameter to MicrosoftAzure class [#1357](https://github.com/apache/datafusion-python/pull/1357) (djouallah) +- Prepare for DF52 release [#1337](https://github.com/apache/datafusion-python/pull/1337) (timsaucer) +- build(deps): bump actions/checkout from 5 to 6 [#1310](https://github.com/apache/datafusion-python/pull/1310) (dependabot[bot]) +- build(deps): bump actions/download-artifact from 5 to 7 [#1321](https://github.com/apache/datafusion-python/pull/1321) (dependabot[bot]) +- build(deps): bump actions/upload-artifact from 4 to 6 [#1322](https://github.com/apache/datafusion-python/pull/1322) (dependabot[bot]) +- build(deps): bump actions/cache from 4 to 5 [#1323](https://github.com/apache/datafusion-python/pull/1323) (dependabot[bot]) +- Pass Field information back and forth when using scalar UDFs [#1299](https://github.com/apache/datafusion-python/pull/1299) (timsaucer) +- Update dependency minor versions to prepare for DF52 release [#1368](https://github.com/apache/datafusion-python/pull/1368) (timsaucer) +- Improve displayed error by using 
`DataFusionError`'s `Display` trait [#1370](https://github.com/apache/datafusion-python/pull/1370) (abey79) +- Enforce DataFrame display memory limits with `max_rows` + `min_rows` constraint (deprecate `repr_rows`) [#1367](https://github.com/apache/datafusion-python/pull/1367) (kosiew) +- Implement all CSV reader options [#1361](https://github.com/apache/datafusion-python/pull/1361) (timsaucer) +- chore: add confirmation before tarball is released [#1372](https://github.com/apache/datafusion-python/pull/1372) (milenkovicm) +- Build in debug mode for PRs [#1375](https://github.com/apache/datafusion-python/pull/1375) (timsaucer) +- minor: remove ffi test wheel from distribution artifact [#1378](https://github.com/apache/datafusion-python/pull/1378) (timsaucer) +- chore: update rust 2024 edition [#1371](https://github.com/apache/datafusion-python/pull/1371) (timsaucer) +- Fix Python UDAF list-of-timestamps return by enforcing list-valued scalars and caching PyArrow types [#1347](https://github.com/apache/datafusion-python/pull/1347) (kosiew) +- minor: update cargo dependencies [#1383](https://github.com/apache/datafusion-python/pull/1383) (timsaucer) +- chore: bump Python version for RAT checking [#1386](https://github.com/apache/datafusion-python/pull/1386) (timsaucer) + +## Credits + +Thank you to everyone who contributed to this release. Here is a breakdown of commits (PRs merged) per contributor. + +``` + 13 Tim Saucer + 4 dependabot[bot] + 2 Daniel Mesejo + 2 kosiew + 1 Adisa Mubarak (AdMub) + 1 Antoine Beyeler + 1 Dhanashri Prathamesh Iranna + 1 Marko Milenković + 1 Mimoune +``` + +Thank you also to everyone who contributed in other ways such as filing issues, reviewing PRs, and providing feedback on this release. 
+ diff --git a/dev/check_crates_patch.py b/dev/check_crates_patch.py new file mode 100644 index 000000000..74e489e1f --- /dev/null +++ b/dev/check_crates_patch.py @@ -0,0 +1,61 @@ +#!/usr/bin/env python3 +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Check that no Cargo.toml files contain [patch.crates-io] entries. + +Release builds must not depend on patched crates. During development it is +common to temporarily patch crates-io dependencies, but those patches must +be removed before creating a release. + +An empty [patch.crates-io] section is allowed. 
+""" + +import sys +from pathlib import Path + +import tomllib + + +def main() -> int: + errors: list[str] = [] + for cargo_toml in sorted(Path().rglob("Cargo.toml")): + if "target" in cargo_toml.parts: + continue + with Path.open(cargo_toml, "rb") as f: + data = tomllib.load(f) + patch = data.get("patch", {}).get("crates-io", {}) + if patch: + errors.append(str(cargo_toml)) + for name, spec in patch.items(): + errors.append(f" {name} = {spec}") + + if errors: + print("ERROR: Release builds must not contain [patch.crates-io] entries.") + print() + for line in errors: + print(line) + print() + print("Remove all [patch.crates-io] entries before creating a release.") + return 1 + + print("OK: No [patch.crates-io] entries found.") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/dev/create_license.py b/dev/create_license.py index 2a67cb8fd..acbf8587c 100644 --- a/dev/create_license.py +++ b/dev/create_license.py @@ -20,12 +20,11 @@ import json import subprocess +from pathlib import Path -subprocess.check_output(["cargo", "install", "cargo-license"]) data = subprocess.check_output( [ - "cargo", - "license", + "cargo-license", "--avoid-build-deps", "--avoid-dev-deps", "--do-not-bundle", @@ -248,5 +247,5 @@ result += "------------------\n\n" result += f"### {name} {version}\n* source: [{repository}]({repository})\n* license: {license}\n\n" -with open("LICENSE.txt", "w") as f: +with Path.open("LICENSE.txt", "w") as f: f.write(result) diff --git a/dev/release/README.md b/dev/release/README.md index 567f03402..ed28f4aa6 100644 --- a/dev/release/README.md +++ b/dev/release/README.md @@ -81,6 +81,9 @@ Generating changelog content ### Update the version number The only place you should need to update the version is in the root `Cargo.toml`. +After updating the toml file, run `cargo update` to update the cargo lock file. 
+If you do not want to update all the dependencies, you can instead run `cargo build` +which should only update the version number for `datafusion-python`. ### Tag the Repository @@ -151,6 +154,25 @@ This will create a file named `dist/datafusion-0.7.0.tar.gz`. Upload this to tes python3 -m twine upload --repository testpypi dist/datafusion-0.7.0.tar.gz ``` +### Run Verify Release Candidate Workflow + +Before sending the vote email, run the manually triggered GitHub Actions workflow +"Verify Release Candidate" and confirm all matrix jobs pass across the OS/architecture matrix +(for example, Linux, macOS, and Windows runners): + +1. Go to https://github.com/apache/datafusion-python/actions/workflows/verify-release-candidate.yml +2. Click "Run workflow" +3. Set `version` to the release version (for example, `52.0.0`) +4. Set `rc_number` to the RC number (for example, `0`) +5. Wait for all jobs to complete successfully + +Include a short note in the vote email template that this workflow was run across all OS/architecture +matrix entries and that all jobs passed. + +```text +Verification note: The manually triggered "Verify Release Candidate" workflow was run for version and rc_number across all configured OS/architecture matrix entries, and all matrix jobs completed successfully. +``` + ### Send the Email Send the email to start the vote. 
diff --git a/dev/release/check-rat-report.py b/dev/release/check-rat-report.py index 0c9f4c326..72a35212e 100644 --- a/dev/release/check-rat-report.py +++ b/dev/release/check-rat-report.py @@ -21,6 +21,7 @@ import re import sys import xml.etree.ElementTree as ET +from pathlib import Path if len(sys.argv) != 3: sys.stderr.write("Usage: %s exclude_globs.lst rat_report.xml\n" % sys.argv[0]) @@ -29,7 +30,7 @@ exclude_globs_filename = sys.argv[1] xml_filename = sys.argv[2] -globs = [line.strip() for line in open(exclude_globs_filename)] +globs = [line.strip() for line in Path.open(exclude_globs_filename)] tree = ET.parse(xml_filename) root = tree.getroot() diff --git a/dev/release/release-tarball.sh b/dev/release/release-tarball.sh index 8c305a676..2b82d1bac 100755 --- a/dev/release/release-tarball.sh +++ b/dev/release/release-tarball.sh @@ -43,6 +43,13 @@ fi version=$1 rc=$2 +read -r -p "Proceed to release tarball for ${version}-rc${rc}? [y/N]: " answer +answer=${answer:-no} +if [ "${answer}" != "y" ]; then + echo "Cancelled tarball release!" + exit 1 +fi + tmp_dir=tmp-apache-datafusion-python-dist echo "Recreate temporary directory: ${tmp_dir}" diff --git a/dev/release/verify-release-candidate.sh b/dev/release/verify-release-candidate.sh index 2bfce0e2d..9591e0335 100755 --- a/dev/release/verify-release-candidate.sh +++ b/dev/release/verify-release-candidate.sh @@ -112,8 +112,17 @@ test_source_distribution() { curl https://sh.rustup.rs -sSf | sh -s -- -y --no-modify-path - export PATH=$RUSTUP_HOME/bin:$PATH - source $RUSTUP_HOME/env + # On Unix, rustup creates an env file. On Windows GitHub runners (MSYS bash), + # that file may not exist, so fall back to adding Cargo bin directly. 
+ if [ -f "$CARGO_HOME/env" ]; then + # shellcheck disable=SC1090 + source "$CARGO_HOME/env" + elif [ -f "$RUSTUP_HOME/env" ]; then + # shellcheck disable=SC1090 + source "$RUSTUP_HOME/env" + else + export PATH="$CARGO_HOME/bin:$PATH" + fi # build and test rust @@ -126,10 +135,20 @@ test_source_distribution() { git clone https://github.com/apache/parquet-testing.git parquet-testing python3 -m venv .venv - source .venv/bin/activate - python3 -m pip install -U pip - python3 -m pip install -U maturin - maturin develop + if [ -x ".venv/bin/python" ]; then + VENV_PYTHON=".venv/bin/python" + elif [ -x ".venv/Scripts/python.exe" ]; then + VENV_PYTHON=".venv/Scripts/python.exe" + elif [ -x ".venv/Scripts/python" ]; then + VENV_PYTHON=".venv/Scripts/python" + else + echo "Unable to find python executable in virtual environment" + exit 1 + fi + + "$VENV_PYTHON" -m pip install -U pip + "$VENV_PYTHON" -m pip install -U maturin + "$VENV_PYTHON" -m maturin develop #TODO: we should really run tests here as well #python3 -m pytest diff --git a/docs/README.md b/docs/README.md index 2bffea9bd..502f1c2a1 100644 --- a/docs/README.md +++ b/docs/README.md @@ -59,7 +59,7 @@ firefox docs/build/html/index.html This documentation is hosted at https://datafusion.apache.org/python When the PR is merged to the `main` branch of the DataFusion -repository, a [github workflow](https://github.com/apache/datafusion-python/blob/main/.github/workflows/docs.yaml) which: +repository, a [github workflow](https://github.com/apache/datafusion-python/blob/main/.github/workflows/build.yml) which: 1. Builds the html content 2. Pushes the html content to the [`asf-site`](https://github.com/apache/datafusion-python/tree/asf-site) branch in this repository. 
@@ -67,4 +67,4 @@ repository, a [github workflow](https://github.com/apache/datafusion-python/blob The Apache Software Foundation provides https://arrow.apache.org/, which serves content based on the configuration in [.asf.yaml](https://github.com/apache/datafusion-python/blob/main/.asf.yaml), -which specifies the target as https://datafusion.apache.org/python. \ No newline at end of file +which specifies the target as https://datafusion.apache.org/python. diff --git a/docs/source/contributor-guide/ffi.rst b/docs/source/contributor-guide/ffi.rst index 64413866f..e0158e0a2 100644 --- a/docs/source/contributor-guide/ffi.rst +++ b/docs/source/contributor-guide/ffi.rst @@ -15,6 +15,8 @@ .. specific language governing permissions and limitations .. under the License. +.. _ffi: + Python Extensions ================= @@ -154,7 +156,7 @@ instead of mutating the container directly: .. code-block:: rust - #[pyclass(name = "Config", module = "datafusion", subclass, frozen)] + #[pyclass(from_py_object, name = "Config", module = "datafusion", subclass, frozen)] #[derive(Clone)] pub(crate) struct PyConfig { config: Arc>, @@ -168,7 +170,7 @@ existing instance in place: .. code-block:: rust - #[pyclass(frozen, name = "SessionContext", module = "datafusion", subclass)] + #[pyclass(from_py_object, frozen, name = "SessionContext", module = "datafusion", subclass)] #[derive(Clone)] pub struct PySessionContext { pub ctx: SessionContext, @@ -184,7 +186,7 @@ field updates: // TODO: This looks like this needs pyo3 tracking so leaving unfrozen for now #[derive(Debug, Clone)] - #[pyclass(name = "DataTypeMap", module = "datafusion.common", subclass)] + #[pyclass(from_py_object, name = "DataTypeMap", module = "datafusion.common", subclass)] pub struct DataTypeMap { #[pyo3(get, set)] pub arrow_type: PyDataType, @@ -230,8 +232,11 @@ can then be turned into a ``ForeignTableProvider`` the associated code is: .. 
code-block:: rust - let capsule = capsule.downcast::()?; - let provider = unsafe { capsule.reference::() }; + let capsule = capsule.cast::()?; + let data: NonNull = capsule + .pointer_checked(Some(name))? + .cast(); + let codec = unsafe { data.as_ref() }; By convention the ``datafusion-python`` library expects a Python object that has a ``TableProvider`` PyCapsule to have this capsule accessible by calling a function named diff --git a/docs/source/index.rst b/docs/source/index.rst index adec60f48..134d41cb6 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -77,6 +77,7 @@ Example user-guide/io/index user-guide/configuration user-guide/sql + user-guide/upgrade-guides .. _toc.contributor_guide: diff --git a/docs/source/user-guide/common-operations/joins.rst b/docs/source/user-guide/common-operations/joins.rst index 40d922150..1d9d70385 100644 --- a/docs/source/user-guide/common-operations/joins.rst +++ b/docs/source/user-guide/common-operations/joins.rst @@ -101,4 +101,36 @@ the right table. .. ipython:: python - left.join(right, left_on="customer_id", right_on="id", how="anti") \ No newline at end of file + left.join(right, left_on="customer_id", right_on="id", how="anti") + +Duplicate Keys +-------------- + +It is common to join two DataFrames on a common column name. Starting in +version 51.0.0, ``datafusion-python`` coalesces columns with identical names by +default. This reduces problems with ambiguous column selection after joins. +You can disable this feature by setting the parameter ``coalesce_duplicate_keys`` +to ``False``. + +.. ipython:: python + + left = ctx.from_pydict( + { + "id": [1, 2, 3], + "customer": ["Alice", "Bob", "Charlie"], + } + ) + + right = ctx.from_pylist([ + {"id": 1, "name": "CityCabs"}, + {"id": 2, "name": "MetroRide"}, + {"id": 5, "name": "UrbanGo"}, + ]) + + left.join(right, "id", how="inner") + +In contrast to the above example, if we wish to get both columns: + +..
ipython:: python + + left.join(right, "id", how="inner", coalesce_duplicate_keys=False) diff --git a/docs/source/user-guide/common-operations/udf-and-udfa.rst b/docs/source/user-guide/common-operations/udf-and-udfa.rst index 0830fa81c..f669721a3 100644 --- a/docs/source/user-guide/common-operations/udf-and-udfa.rst +++ b/docs/source/user-guide/common-operations/udf-and-udfa.rst @@ -90,6 +90,17 @@ converting to Python objects to do the evaluation. df.select(col("a"), is_null_arr(col("a")).alias("is_null")).show() +In this example we passed the PyArrow ``DataType`` when we defined the function +by calling ``udf()``. If you need additional control, such as specifying +metadata or nullability of the input or output, you can instead specify a +PyArrow ``Field``. + +If you need to write a custom function but do not want to incur the performance +cost of converting to Python objects and back, a more advanced approach is to +write Rust based UDFs and to expose them to Python. There is an example in the +`DataFusion blog `_ +describing how to do this. + Aggregate Functions ------------------- @@ -112,7 +123,7 @@ also see how the inputs to ``update`` and ``merge`` differ. .. code-block:: python - import pyarrow + import pyarrow as pa import pyarrow.compute import datafusion from datafusion import col, udaf, Accumulator @@ -125,16 +136,16 @@ also see how the inputs to ``update`` and ``merge`` differ. 
def __init__(self): self._sum = 0.0 - def update(self, values_a: pyarrow.Array, values_b: pyarrow.Array) -> None: + def update(self, values_a: pa.Array, values_b: pa.Array) -> None: self._sum = self._sum + pyarrow.compute.sum(values_a).as_py() - pyarrow.compute.sum(values_b).as_py() - def merge(self, states: List[pyarrow.Array]) -> None: + def merge(self, states: list[pa.Array]) -> None: self._sum = self._sum + pyarrow.compute.sum(states[0]).as_py() - def state(self) -> pyarrow.Array: - return pyarrow.array([self._sum]) + def state(self) -> list[pa.Scalar]: + return [pyarrow.scalar(self._sum)] - def evaluate(self) -> pyarrow.Scalar: + def evaluate(self) -> pa.Scalar: return pyarrow.scalar(self._sum) ctx = datafusion.SessionContext() @@ -145,10 +156,29 @@ also see how the inputs to ``update`` and ``merge`` differ. } ) - my_udaf = udaf(MyAccumulator, [pyarrow.float64(), pyarrow.float64()], pyarrow.float64(), [pyarrow.float64()], 'stable') + my_udaf = udaf(MyAccumulator, [pa.float64(), pa.float64()], pa.float64(), [pa.float64()], 'stable') df.aggregate([], [my_udaf(col("a"), col("b")).alias("col_diff")]) +FAQ +^^^ + +**How do I return a list from a UDAF?** + +Both the ``evaluate`` and the ``state`` functions are expected to return scalar values. +If you wish to return a list array as a scalar value, the best practice is to +wrap the values in a ``pyarrow.Scalar`` object. For example, you can return a +timestamp list with ``pa.scalar([...], type=pa.list_(pa.timestamp("ms")))`` and +register the appropriate return or state types as +``return_type=pa.list_(pa.timestamp("ms"))`` and +``state_type=[pa.list_(pa.timestamp("ms"))]``, respectively. + +As of DataFusion 52.0.0, you can return any Python object, including a +PyArrow array, as the return value(s) for these functions and DataFusion will +attempt to create a scalar type from the value.
DataFusion has been tested to +convert PyArrow, nanoarrow, and arro3 objects as well as primitive data types +like integers, strings, and so on. + Window Functions ---------------- diff --git a/docs/source/user-guide/configuration.rst b/docs/source/user-guide/configuration.rst index 5425a040d..f8e613cd4 100644 --- a/docs/source/user-guide/configuration.rst +++ b/docs/source/user-guide/configuration.rst @@ -15,6 +15,8 @@ .. specific language governing permissions and limitations .. under the License. +.. _configuration: + Configuration ============= diff --git a/docs/source/user-guide/dataframe/index.rst b/docs/source/user-guide/dataframe/index.rst index 1387db0bd..510bcbc68 100644 --- a/docs/source/user-guide/dataframe/index.rst +++ b/docs/source/user-guide/dataframe/index.rst @@ -95,8 +95,9 @@ DataFusion's DataFrame API offers a wide range of operations: # Select with expressions df = df.select(column("a") + column("b"), column("a") - column("b")) - # Filter rows + # Filter rows (expressions or SQL strings) df = df.filter(column("age") > literal(25)) + df = df.filter("age > 25") # Add computed columns df = df.with_column("full_name", column("first_name") + literal(" ") + column("last_name")) @@ -195,10 +196,121 @@ To materialize the results of your DataFrame operations: # Display results df.show() # Print tabular format to console - + # Count rows count = df.count() + # Collect a single column of data as a PyArrow Array + arr = df.collect_column("age") + +Zero-copy streaming to Arrow-based Python libraries +--------------------------------------------------- + +DataFusion DataFrames implement the ``__arrow_c_stream__`` protocol, enabling +zero-copy, lazy streaming into Arrow-based Python libraries. With the streaming +protocol, batches are produced on demand. + +.. 
note:: + + The protocol is implementation-agnostic and works with any Python library + that understands the Arrow C streaming interface (for example, PyArrow + or other Arrow-compatible implementations). The sections below provide a + short PyArrow-specific example and general guidance for other + implementations. + +PyArrow +------- + +.. code-block:: python + + import pyarrow as pa + + # Create a PyArrow RecordBatchReader without materializing all batches + reader = pa.RecordBatchReader.from_stream(df) + for batch in reader: + ... # process each batch as it is produced + +DataFrames are also iterable, yielding :class:`datafusion.RecordBatch` +objects lazily so you can loop over results directly without importing +PyArrow: + +.. code-block:: python + + for batch in df: + ... # each batch is a ``datafusion.RecordBatch`` + +Each batch exposes ``to_pyarrow()``, allowing conversion to a PyArrow +table. ``pa.table(df)`` collects the entire DataFrame eagerly into a +PyArrow table: + +.. code-block:: python + + import pyarrow as pa + table = pa.table(df) + +Asynchronous iteration is supported as well, allowing integration with +``asyncio`` event loops: + +.. code-block:: python + + async for batch in df: + ... # process each batch as it is produced + +To work with the stream directly, use ``execute_stream()``, which returns a +:class:`~datafusion.RecordBatchStream`. + +.. code-block:: python + + stream = df.execute_stream() + for batch in stream: + ... + +Execute as Stream +^^^^^^^^^^^^^^^^^ + +For finer control over streaming execution, use +:py:meth:`~datafusion.DataFrame.execute_stream` to obtain a +:py:class:`datafusion.RecordBatchStream`: + +.. code-block:: python + + stream = df.execute_stream() + for batch in stream: + ... # process each batch as it is produced + +.. tip:: + + To get a PyArrow reader instead, call + + ``pa.RecordBatchReader.from_stream(df)``. 
+ +When partition boundaries are important, +:py:meth:`~datafusion.DataFrame.execute_stream_partitioned` +returns an iterable of :py:class:`datafusion.RecordBatchStream` objects, one per +partition: + +.. code-block:: python + + for stream in df.execute_stream_partitioned(): + for batch in stream: + ... # each stream yields RecordBatches + +To process partitions concurrently, first collect the streams into a list +and then poll each one in a separate ``asyncio`` task: + +.. code-block:: python + + import asyncio + + async def consume(stream): + async for batch in stream: + ... + + streams = list(df.execute_stream_partitioned()) + await asyncio.gather(*(consume(s) for s in streams)) + +See :doc:`../io/arrow` for additional details on the Arrow interface. + HTML Rendering -------------- diff --git a/docs/source/user-guide/dataframe/rendering.rst b/docs/source/user-guide/dataframe/rendering.rst index 4c37c7471..9dea948bb 100644 --- a/docs/source/user-guide/dataframe/rendering.rst +++ b/docs/source/user-guide/dataframe/rendering.rst @@ -57,8 +57,8 @@ You can customize how DataFrames are rendered by configuring the formatter: max_width=1000, # Maximum width in pixels max_height=300, # Maximum height in pixels max_memory_bytes=2097152, # Maximum memory for rendering (2MB) - min_rows_display=20, # Minimum number of rows to display - repr_rows=10, # Number of rows to display in __repr__ + min_rows=10, # Minimum number of rows to display + max_rows=10, # Maximum rows to display in __repr__ enable_cell_expansion=True,# Allow expanding truncated cells custom_css=None, # Additional custom CSS show_truncation_message=True, # Show message when data is truncated @@ -190,8 +190,8 @@ You can control how much data is displayed and how much memory is used for rende configure_formatter( max_memory_bytes=4 * 1024 * 1024, # 4MB maximum memory for display - min_rows_display=50, # Always show at least 50 rows - repr_rows=20 # Show 20 rows in __repr__ output + min_rows=20, # Always show at 
least 20 rows + max_rows=50 # Show up to 50 rows in output ) These parameters help balance comprehensive data display against performance considerations. diff --git a/docs/source/user-guide/io/arrow.rst b/docs/source/user-guide/io/arrow.rst index d571aa99c..9196fcea7 100644 --- a/docs/source/user-guide/io/arrow.rst +++ b/docs/source/user-guide/io/arrow.rst @@ -60,14 +60,16 @@ Exporting from DataFusion DataFusion DataFrames implement ``__arrow_c_stream__`` PyCapsule interface, so any Python library that accepts these can import a DataFusion DataFrame directly. -.. warning:: - It is important to note that this will cause the DataFrame execution to happen, which may be - a time consuming task. That is, you will cause a - :py:func:`datafusion.dataframe.DataFrame.collect` operation call to occur. +Invoking ``__arrow_c_stream__`` triggers execution of the underlying query, but +batches are yielded incrementally rather than materialized all at once in memory. +Consumers can process the stream as it arrives. The stream executes lazily, +letting downstream readers pull batches on demand. .. ipython:: python + from datafusion import col, lit + df = df.select((col("a") * lit(1.5)).alias("c"), lit("df").alias("d")) pa.table(df) diff --git a/docs/source/user-guide/io/csv.rst b/docs/source/user-guide/io/csv.rst index 144b6615c..9c23c291b 100644 --- a/docs/source/user-guide/io/csv.rst +++ b/docs/source/user-guide/io/csv.rst @@ -36,3 +36,25 @@ An alternative is to use :py:func:`~datafusion.context.SessionContext.register_c ctx.register_csv("file", "file.csv") df = ctx.table("file") + +If you require additional control over how to read the CSV file, you can use +:py:class:`~datafusion.options.CsvReadOptions` to set a variety of options. + +.. 
code-block:: python + + from datafusion import CsvReadOptions + options = ( + CsvReadOptions() + .with_has_header(True) # File contains a header row + .with_delimiter(";") # Use ; as the delimiter instead of , + .with_comment("#") # Skip lines starting with # + .with_escape("\\") # Escape character + .with_null_regex(r"^(null|NULL|N/A)$") # Treat these as NULL + .with_truncated_rows(True) # Allow rows to have incomplete columns + .with_file_compression_type("gzip") # Read gzipped CSV + .with_file_extension(".gz") # File extension other than .csv + ) + df = ctx.read_csv("data.csv.gz", options=options) + +Details for all CSV reading options can be found on the +`DataFusion documentation site `_. diff --git a/docs/source/user-guide/sql.rst b/docs/source/user-guide/sql.rst index 6fa7f0c6a..b4bfb9611 100644 --- a/docs/source/user-guide/sql.rst +++ b/docs/source/user-guide/sql.rst @@ -23,17 +23,100 @@ DataFusion also offers a SQL API, read the full reference `here `_, +but allow passing named parameters into a SQL query. Consider this simple +example. + +.. ipython:: python + + def show_attacks(ctx: SessionContext, threshold: int) -> None: + ctx.sql( + 'SELECT "Name", "Attack" FROM pokemon WHERE "Attack" > $val', val=threshold + ).show(num=5) + show_attacks(ctx, 75) + +When passing parameters like the example above we convert the Python objects +into their string representation. We also have special case handling +for :py:class:`~datafusion.dataframe.DataFrame` objects, since they cannot simply +be turned into string representations for an SQL query. In these cases we +will register a temporary view in the :py:class:`~datafusion.context.SessionContext` +using a generated table name. + +The formatting for passing string replacement objects is to precede the +variable name with a single ``$``. This works for all dialects in +the SQL parser except ``hive`` and ``mysql``. Since these dialects do not +support named placeholders, we are unable to do this type of replacement. 
+We recommend either switching to another dialect or using Python +f-string style replacement. + +.. warning:: + + To support DataFrame parameterized queries, your session must support + registration of temporary views. The default + :py:class:`~datafusion.catalog.CatalogProvider` and + :py:class:`~datafusion.catalog.SchemaProvider` do have this capability. + If you have implemented custom providers, it is important that temporary + views do not persist across :py:class:`~datafusion.context.SessionContext` + or you may get unintended consequences. + +The following example shows passing in both a :py:class:`~datafusion.dataframe.DataFrame` +object as well as a Python object to be used in parameterized replacement. + +.. ipython:: python + + def show_column( + ctx: SessionContext, column: str, df: DataFrame, threshold: int + ) -> None: + ctx.sql( + 'SELECT "Name", $col FROM $df WHERE $col > $val', + col=column, + df=df, + val=threshold, + ).show(num=5) + df = ctx.table("pokemon") + show_column(ctx, '"Defense"', df, 75) + +The approach implemented for conversion of variables into a SQL query +relies on string conversion. This has the potential for data loss, +specifically for cases like floating point numbers. If you need to pass +variables into a parameterized query and it is important to maintain the +original value without conversion to a string, then you can use the +optional parameter ``param_values`` to specify these. This parameter +expects a dictionary mapping from the parameter name to a Python +object. Those objects will be cast into a +`PyArrow Scalar Value `_. + +Using ``param_values`` will rely on the SQL dialect you have configured +for your session. This can be set using the :ref:`configuration options ` +of your :py:class:`~datafusion.context.SessionContext`. Similar to how +`prepared statements `_ +work, these parameters are limited to places where you would pass in a +scalar value, such as a comparison. + +.. 
ipython:: python + + def param_attacks(ctx: SessionContext, threshold: int) -> None: + ctx.sql( + 'SELECT "Name", "Attack" FROM pokemon WHERE "Attack" > $val', + param_values={"val": threshold}, + ).show(num=5) + param_attacks(ctx, 75) diff --git a/docs/source/user-guide/upgrade-guides.rst b/docs/source/user-guide/upgrade-guides.rst new file mode 100644 index 000000000..e3d7c2d87 --- /dev/null +++ b/docs/source/user-guide/upgrade-guides.rst @@ -0,0 +1,117 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +Upgrade Guides +============== + +DataFusion 53.0.0 +----------------- + +This version includes an upgraded version of ``pyo3``, which changed the way to extract an FFI +object. Example: + +Before: + +.. code-block:: rust + + let codec = unsafe { capsule.reference::() }; + +Now: + +.. code-block:: rust + + let data: NonNull = capsule + .pointer_checked(Some(c_str!("datafusion_logical_extension_codec")))? + .cast(); + let codec = unsafe { data.as_ref() }; + +DataFusion 52.0.0 +----------------- + +This version includes a major update to the :ref:`ffi` due to upgrades +to the `Foreign Function Interface `_. 
+Users who contribute their own ``CatalogProvider``, ``SchemaProvider``, +``TableProvider`` or ``TableFunction`` via FFI must now provide access to a +``LogicalExtensionCodec`` and a ``TaskContextProvider``. The function signatures +for the methods to get these ``PyCapsule`` objects now require an additional +parameter, which is a Python object that can be used to extract the +``FFI_LogicalExtensionCodec`` that is necessary. + +A complete example can be found in the `FFI example `_. +Your methods need to be updated to take an additional parameter like in this +example. + +.. code-block:: rust + + #[pymethods] + impl MyCatalogProvider { + pub fn __datafusion_catalog_provider__<'py>( + &self, + py: Python<'py>, + session: Bound, + ) -> PyResult> { + let name = cr"datafusion_catalog_provider".into(); + + let provider = Arc::clone(&self.inner) as Arc; + + let codec = ffi_logical_codec_from_pycapsule(session)?; + let provider = FFI_CatalogProvider::new_with_ffi_codec(provider, None, codec); + + PyCapsule::new(py, provider, Some(name)) + } + } + +To extract the logical extension codec FFI object from the provided object you +can implement a helper method such as: + +.. code-block:: rust + + pub(crate) fn ffi_logical_codec_from_pycapsule( + obj: Bound, + ) -> PyResult { + let attr_name = "__datafusion_logical_extension_codec__"; + let capsule = if obj.hasattr(attr_name)? { + obj.getattr(attr_name)?.call0()? + } else { + obj + }; + + let capsule = capsule.downcast::()?; + validate_pycapsule(capsule, "datafusion_logical_extension_codec")?; + + let codec = unsafe { capsule.reference::() }; + + Ok(codec.clone()) + } + + +The DataFusion FFI interface updates no longer depend directly on the +``datafusion`` core crate. You can improve your build times and potentially +reduce your library binary size by removing this dependency and instead +using the specific datafusion project crates. + +For example, instead of including expressions like: + +..
code-block:: rust + + use datafusion::catalog::MemTable; + +Instead you can now write: + +.. code-block:: rust + + use datafusion_catalog::MemTable; diff --git a/examples/csv-read-options.py b/examples/csv-read-options.py new file mode 100644 index 000000000..a5952d950 --- /dev/null +++ b/examples/csv-read-options.py @@ -0,0 +1,96 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +"""Example demonstrating CsvReadOptions usage.""" + +from datafusion import CsvReadOptions, SessionContext + +# Create a SessionContext +ctx = SessionContext() + +# Example 1: Using CsvReadOptions with default values +print("Example 1: Default CsvReadOptions") +options = CsvReadOptions() +df = ctx.read_csv("data.csv", options=options) + +# Example 2: Using CsvReadOptions with custom parameters +print("\nExample 2: Custom CsvReadOptions") +options = CsvReadOptions( + has_header=True, + delimiter=",", + quote='"', + schema_infer_max_records=1000, + file_extension=".csv", +) +df = ctx.read_csv("data.csv", options=options) + +# Example 3: Using the builder pattern (recommended for readability) +print("\nExample 3: Builder pattern") +options = ( + CsvReadOptions() + .with_has_header(True) # noqa: FBT003 + .with_delimiter("|") + .with_quote("'") + .with_schema_infer_max_records(500) + .with_truncated_rows(False) # noqa: FBT003 + .with_newlines_in_values(True) # noqa: FBT003 +) +df = ctx.read_csv("data.csv", options=options) + +# Example 4: Advanced options +print("\nExample 4: Advanced options") +options = ( + CsvReadOptions() + .with_has_header(True) # noqa: FBT003 + .with_delimiter(",") + .with_comment("#") # Skip lines starting with # + .with_escape("\\") # Escape character + .with_null_regex(r"^(null|NULL|N/A)$") # Treat these as NULL + .with_truncated_rows(True) # noqa: FBT003 + .with_file_compression_type("gzip") # Read gzipped CSV + .with_file_extension(".gz") +) +df = ctx.read_csv("data.csv.gz", options=options) + +# Example 5: Register CSV table with options +print("\nExample 5: Register CSV table") +options = CsvReadOptions().with_has_header(True).with_delimiter(",") # noqa: FBT003 +ctx.register_csv("my_table", "data.csv", options=options) +df = ctx.sql("SELECT * FROM my_table") + +# Example 6: Backward compatibility (without options) +print("\nExample 6: Backward compatibility") +# Still works the old way! 
+df = ctx.read_csv("data.csv", has_header=True, delimiter=",") + +print("\nAll examples completed!") +print("\nFor all available options, see the CsvReadOptions documentation:") +print(" - has_header: bool") +print(" - delimiter: str") +print(" - quote: str") +print(" - terminator: str | None") +print(" - escape: str | None") +print(" - comment: str | None") +print(" - newlines_in_values: bool") +print(" - schema: pa.Schema | None") +print(" - schema_infer_max_records: int") +print(" - file_extension: str") +print(" - table_partition_cols: list[tuple[str, pa.DataType]]") +print(" - file_compression_type: str") +print(" - file_sort_order: list[list[SortExpr]]") +print(" - null_regex: str | None") +print(" - truncated_rows: bool") diff --git a/examples/datafusion-ffi-example/.cargo/config.toml b/examples/datafusion-ffi-example/.cargo/config.toml deleted file mode 100644 index 91a099a61..000000000 --- a/examples/datafusion-ffi-example/.cargo/config.toml +++ /dev/null @@ -1,12 +0,0 @@ -[target.x86_64-apple-darwin] -rustflags = [ - "-C", "link-arg=-undefined", - "-C", "link-arg=dynamic_lookup", -] - -[target.aarch64-apple-darwin] -rustflags = [ - "-C", "link-arg=-undefined", - "-C", "link-arg=dynamic_lookup", -] - diff --git a/examples/datafusion-ffi-example/Cargo.lock b/examples/datafusion-ffi-example/Cargo.lock deleted file mode 100644 index 58849f8e9..000000000 --- a/examples/datafusion-ffi-example/Cargo.lock +++ /dev/null @@ -1,3624 +0,0 @@ -# This file is automatically @generated by Cargo. -# It is not intended for manual editing. 
-version = 4 - -[[package]] -name = "abi_stable" -version = "0.11.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "69d6512d3eb05ffe5004c59c206de7f99c34951504056ce23fc953842f12c445" -dependencies = [ - "abi_stable_derive", - "abi_stable_shared", - "const_panic", - "core_extensions", - "crossbeam-channel", - "generational-arena", - "libloading", - "lock_api", - "parking_lot", - "paste", - "repr_offset", - "rustc_version", - "serde", - "serde_derive", - "serde_json", -] - -[[package]] -name = "abi_stable_derive" -version = "0.11.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d7178468b407a4ee10e881bc7a328a65e739f0863615cca4429d43916b05e898" -dependencies = [ - "abi_stable_shared", - "as_derive_utils", - "core_extensions", - "proc-macro2", - "quote", - "rustc_version", - "syn 1.0.109", - "typed-arena", -] - -[[package]] -name = "abi_stable_shared" -version = "0.11.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b2b5df7688c123e63f4d4d649cba63f2967ba7f7861b1664fca3f77d3dad2b63" -dependencies = [ - "core_extensions", -] - -[[package]] -name = "addr2line" -version = "0.24.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dfbe277e56a376000877090da837660b4427aad530e3028d44e0bffe4f89a1c1" -dependencies = [ - "gimli", -] - -[[package]] -name = "adler2" -version = "2.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa" - -[[package]] -name = "ahash" -version = "0.8.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75" -dependencies = [ - "cfg-if", - "const-random", - "getrandom 0.3.3", - "once_cell", - "version_check", - "zerocopy", -] - -[[package]] -name = "aho-corasick" -version = "1.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916" -dependencies = [ - "memchr", -] - -[[package]] -name = "alloc-no-stdlib" -version = "2.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cc7bb162ec39d46ab1ca8c77bf72e890535becd1751bb45f64c597edb4c8c6b3" - -[[package]] -name = "alloc-stdlib" -version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "94fb8275041c72129eb51b7d0322c29b8387a0386127718b096429201a5d6ece" -dependencies = [ - "alloc-no-stdlib", -] - -[[package]] -name = "allocator-api2" -version = "0.2.21" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923" - -[[package]] -name = "android-tzdata" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e999941b234f3131b00bc13c22d06e8c5ff726d1b6318ac7eb276997bbb4fef0" - -[[package]] -name = "android_system_properties" -version = "0.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311" -dependencies = [ - "libc", -] - -[[package]] -name = "anyhow" -version = "1.0.99" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b0674a1ddeecb70197781e945de4b3b8ffb61fa939a5597bcf48503737663100" - -[[package]] -name = "arrayref" -version = "0.3.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "76a2e8124351fda1ef8aaaa3bbd7ebbcb486bbcd4225aca0aa0d84bb2db8fecb" - -[[package]] -name = "arrayvec" -version = "0.7.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" - -[[package]] -name = "arrow" -version = "55.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"f3f15b4c6b148206ff3a2b35002e08929c2462467b62b9c02036d9c34f9ef994" -dependencies = [ - "arrow-arith", - "arrow-array", - "arrow-buffer", - "arrow-cast", - "arrow-csv", - "arrow-data", - "arrow-ipc", - "arrow-json", - "arrow-ord", - "arrow-row", - "arrow-schema", - "arrow-select", - "arrow-string", -] - -[[package]] -name = "arrow-arith" -version = "55.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "30feb679425110209ae35c3fbf82404a39a4c0436bb3ec36164d8bffed2a4ce4" -dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-schema", - "chrono", - "num", -] - -[[package]] -name = "arrow-array" -version = "55.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "70732f04d285d49054a48b72c54f791bb3424abae92d27aafdf776c98af161c8" -dependencies = [ - "ahash", - "arrow-buffer", - "arrow-data", - "arrow-schema", - "chrono", - "chrono-tz", - "half", - "hashbrown 0.15.5", - "num", -] - -[[package]] -name = "arrow-buffer" -version = "55.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "169b1d5d6cb390dd92ce582b06b23815c7953e9dfaaea75556e89d890d19993d" -dependencies = [ - "bytes", - "half", - "num", -] - -[[package]] -name = "arrow-cast" -version = "55.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e4f12eccc3e1c05a766cafb31f6a60a46c2f8efec9b74c6e0648766d30686af8" -dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-schema", - "arrow-select", - "atoi", - "base64", - "chrono", - "comfy-table", - "half", - "lexical-core", - "num", - "ryu", -] - -[[package]] -name = "arrow-csv" -version = "55.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "012c9fef3f4a11573b2c74aec53712ff9fdae4a95f4ce452d1bbf088ee00f06b" -dependencies = [ - "arrow-array", - "arrow-cast", - "arrow-schema", - "chrono", - "csv", - "csv-core", - "regex", -] - -[[package]] -name = "arrow-data" -version = 
"55.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8de1ce212d803199684b658fc4ba55fb2d7e87b213de5af415308d2fee3619c2" -dependencies = [ - "arrow-buffer", - "arrow-schema", - "half", - "num", -] - -[[package]] -name = "arrow-ipc" -version = "55.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d9ea5967e8b2af39aff5d9de2197df16e305f47f404781d3230b2dc672da5d92" -dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-schema", - "flatbuffers", - "lz4_flex", - "zstd", -] - -[[package]] -name = "arrow-json" -version = "55.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5709d974c4ea5be96d900c01576c7c0b99705f4a3eec343648cb1ca863988a9c" -dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-cast", - "arrow-data", - "arrow-schema", - "chrono", - "half", - "indexmap", - "lexical-core", - "memchr", - "num", - "serde", - "serde_json", - "simdutf8", -] - -[[package]] -name = "arrow-ord" -version = "55.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6506e3a059e3be23023f587f79c82ef0bcf6d293587e3272d20f2d30b969b5a7" -dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-schema", - "arrow-select", -] - -[[package]] -name = "arrow-row" -version = "55.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "52bf7393166beaf79b4bed9bfdf19e97472af32ce5b6b48169d321518a08cae2" -dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-schema", - "half", -] - -[[package]] -name = "arrow-schema" -version = "55.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "af7686986a3bf2254c9fb130c623cdcb2f8e1f15763e7c71c310f0834da3d292" -dependencies = [ - "bitflags", - "serde", - "serde_json", -] - -[[package]] -name = "arrow-select" -version = "55.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"dd2b45757d6a2373faa3352d02ff5b54b098f5e21dccebc45a21806bc34501e5" -dependencies = [ - "ahash", - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-schema", - "num", -] - -[[package]] -name = "arrow-string" -version = "55.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0377d532850babb4d927a06294314b316e23311503ed580ec6ce6a0158f49d40" -dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-schema", - "arrow-select", - "memchr", - "num", - "regex", - "regex-syntax", -] - -[[package]] -name = "as_derive_utils" -version = "0.11.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ff3c96645900a44cf11941c111bd08a6573b0e2f9f69bc9264b179d8fae753c4" -dependencies = [ - "core_extensions", - "proc-macro2", - "quote", - "syn 1.0.109", -] - -[[package]] -name = "async-compression" -version = "0.4.19" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "06575e6a9673580f52661c92107baabffbf41e2141373441cbcdc47cb733003c" -dependencies = [ - "bzip2 0.5.2", - "flate2", - "futures-core", - "memchr", - "pin-project-lite", - "tokio", - "xz2", - "zstd", - "zstd-safe", -] - -[[package]] -name = "async-ffi" -version = "0.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f4de21c0feef7e5a556e51af767c953f0501f7f300ba785cc99c47bdc8081a50" -dependencies = [ - "abi_stable", -] - -[[package]] -name = "async-trait" -version = "0.1.89" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9035ad2d096bed7955a320ee7e2230574d28fd3c3a0f186cbea1ff3c7eed5dbb" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.106", -] - -[[package]] -name = "atoi" -version = "2.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f28d99ec8bfea296261ca1af174f24225171fea9664ba9003cbebee704810528" -dependencies = [ - "num-traits", -] - -[[package]] -name = "autocfg" -version = "1.5.0" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" - -[[package]] -name = "backtrace" -version = "0.3.75" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6806a6321ec58106fea15becdad98371e28d92ccbc7c8f1b3b6dd724fe8f1002" -dependencies = [ - "addr2line", - "cfg-if", - "libc", - "miniz_oxide", - "object", - "rustc-demangle", - "windows-targets 0.52.6", -] - -[[package]] -name = "base64" -version = "0.22.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" - -[[package]] -name = "bigdecimal" -version = "0.4.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a22f228ab7a1b23027ccc6c350b72868017af7ea8356fbdf19f8d991c690013" -dependencies = [ - "autocfg", - "libm", - "num-bigint", - "num-integer", - "num-traits", -] - -[[package]] -name = "bitflags" -version = "2.9.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "34efbcccd345379ca2868b2b2c9d3782e9cc58ba87bc7d79d5b53d9c9ae6f25d" - -[[package]] -name = "blake2" -version = "0.10.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "46502ad458c9a52b69d4d4d32775c788b7a1b85e8bc9d482d92250fc0e3f8efe" -dependencies = [ - "digest", -] - -[[package]] -name = "blake3" -version = "1.8.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3888aaa89e4b2a40fca9848e400f6a658a5a3978de7be858e209cafa8be9a4a0" -dependencies = [ - "arrayref", - "arrayvec", - "cc", - "cfg-if", - "constant_time_eq", -] - -[[package]] -name = "block-buffer" -version = "0.10.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71" -dependencies = [ - "generic-array", -] - -[[package]] -name = "brotli" -version = "8.0.2" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "4bd8b9603c7aa97359dbd97ecf258968c95f3adddd6db2f7e7a5bef101c84560" -dependencies = [ - "alloc-no-stdlib", - "alloc-stdlib", - "brotli-decompressor", -] - -[[package]] -name = "brotli-decompressor" -version = "5.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "874bb8112abecc98cbd6d81ea4fa7e94fb9449648c93cc89aa40c81c24d7de03" -dependencies = [ - "alloc-no-stdlib", - "alloc-stdlib", -] - -[[package]] -name = "bumpalo" -version = "3.19.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "46c5e41b57b8bba42a04676d81cb89e9ee8e859a1a66f80a5a72e1cb76b34d43" - -[[package]] -name = "byteorder" -version = "1.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" - -[[package]] -name = "bytes" -version = "1.10.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d71b6127be86fdcfddb610f7182ac57211d4b18a3e9c82eb2d17662f2227ad6a" - -[[package]] -name = "bzip2" -version = "0.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "49ecfb22d906f800d4fe833b6282cf4dc1c298f5057ca0b5445e5c209735ca47" -dependencies = [ - "bzip2-sys", -] - -[[package]] -name = "bzip2" -version = "0.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bea8dcd42434048e4f7a304411d9273a411f647446c1234a65ce0554923f4cff" -dependencies = [ - "libbz2-rs-sys", -] - -[[package]] -name = "bzip2-sys" -version = "0.1.13+1.0.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "225bff33b2141874fe80d71e07d6eec4f85c5c216453dd96388240f96e1acc14" -dependencies = [ - "cc", - "pkg-config", -] - -[[package]] -name = "cc" -version = "1.2.34" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "42bc4aea80032b7bf409b0bc7ccad88853858911b7713a8062fdc0623867bedc" 
-dependencies = [ - "jobserver", - "libc", - "shlex", -] - -[[package]] -name = "cfg-if" -version = "1.0.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2fd1289c04a9ea8cb22300a459a72a385d7c73d3259e2ed7dcb2af674838cfa9" - -[[package]] -name = "chrono" -version = "0.4.41" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c469d952047f47f91b68d1cba3f10d63c11d73e4636f24f08daf0278abf01c4d" -dependencies = [ - "android-tzdata", - "iana-time-zone", - "num-traits", - "windows-link", -] - -[[package]] -name = "chrono-tz" -version = "0.10.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a6139a8597ed92cf816dfb33f5dd6cf0bb93a6adc938f11039f371bc5bcd26c3" -dependencies = [ - "chrono", - "phf", -] - -[[package]] -name = "comfy-table" -version = "7.1.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4a65ebfec4fb190b6f90e944a817d60499ee0744e582530e2c9900a22e591d9a" -dependencies = [ - "unicode-segmentation", - "unicode-width", -] - -[[package]] -name = "const-random" -version = "0.1.18" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87e00182fe74b066627d63b85fd550ac2998d4b0bd86bfed477a0ae4c7c71359" -dependencies = [ - "const-random-macro", -] - -[[package]] -name = "const-random-macro" -version = "0.1.16" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f9d839f2a20b0aee515dc581a6172f2321f96cab76c1a38a4c584a194955390e" -dependencies = [ - "getrandom 0.2.16", - "once_cell", - "tiny-keccak", -] - -[[package]] -name = "const_panic" -version = "0.2.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bb8a602185c3c95b52f86dc78e55a6df9a287a7a93ddbcf012509930880cf879" -dependencies = [ - "typewit", -] - -[[package]] -name = "constant_time_eq" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"7c74b8349d32d297c9134b8c88677813a227df8f779daa29bfc29c183fe3dca6" - -[[package]] -name = "core-foundation-sys" -version = "0.8.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" - -[[package]] -name = "core_extensions" -version = "1.5.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "42bb5e5d0269fd4f739ea6cedaf29c16d81c27a7ce7582008e90eb50dcd57003" -dependencies = [ - "core_extensions_proc_macros", -] - -[[package]] -name = "core_extensions_proc_macros" -version = "1.5.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "533d38ecd2709b7608fb8e18e4504deb99e9a72879e6aa66373a76d8dc4259ea" - -[[package]] -name = "cpufeatures" -version = "0.2.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "59ed5838eebb26a2bb2e58f6d5b5316989ae9d08bab10e0e6d103e656d1b0280" -dependencies = [ - "libc", -] - -[[package]] -name = "crc32fast" -version = "1.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9481c1c90cbf2ac953f07c8d4a58aa3945c425b7185c9154d67a65e4230da511" -dependencies = [ - "cfg-if", -] - -[[package]] -name = "crossbeam-channel" -version = "0.5.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "82b8f8f868b36967f9606790d1903570de9ceaf870a7bf9fbbd3016d636a2cb2" -dependencies = [ - "crossbeam-utils", -] - -[[package]] -name = "crossbeam-utils" -version = "0.8.21" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" - -[[package]] -name = "crunchy" -version = "0.2.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "460fbee9c2c2f33933d720630a6a0bac33ba7053db5344fac858d4b8952d77d5" - -[[package]] -name = "crypto-common" -version = "0.1.6" -source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "1bfb12502f3fc46cca1bb51ac28df9d618d813cdc3d2f25b9fe775a34af26bb3" -dependencies = [ - "generic-array", - "typenum", -] - -[[package]] -name = "csv" -version = "1.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "acdc4883a9c96732e4733212c01447ebd805833b7275a73ca3ee080fd77afdaf" -dependencies = [ - "csv-core", - "itoa", - "ryu", - "serde", -] - -[[package]] -name = "csv-core" -version = "0.1.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7d02f3b0da4c6504f86e9cd789d8dbafab48c2321be74e9987593de5a894d93d" -dependencies = [ - "memchr", -] - -[[package]] -name = "dashmap" -version = "6.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5041cc499144891f3790297212f32a74fb938e5136a14943f338ef9e0ae276cf" -dependencies = [ - "cfg-if", - "crossbeam-utils", - "hashbrown 0.14.5", - "lock_api", - "once_cell", - "parking_lot_core", -] - -[[package]] -name = "datafusion" -version = "49.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "69dfeda1633bf8ec75b068d9f6c27cdc392ffcf5ff83128d5dbab65b73c1fd02" -dependencies = [ - "arrow", - "arrow-ipc", - "arrow-schema", - "async-trait", - "bytes", - "bzip2 0.6.0", - "chrono", - "datafusion-catalog", - "datafusion-catalog-listing", - "datafusion-common", - "datafusion-common-runtime", - "datafusion-datasource", - "datafusion-datasource-csv", - "datafusion-datasource-json", - "datafusion-datasource-parquet", - "datafusion-execution", - "datafusion-expr", - "datafusion-expr-common", - "datafusion-functions", - "datafusion-functions-aggregate", - "datafusion-functions-nested", - "datafusion-functions-table", - "datafusion-functions-window", - "datafusion-optimizer", - "datafusion-physical-expr", - "datafusion-physical-expr-common", - "datafusion-physical-optimizer", - "datafusion-physical-plan", - "datafusion-session", - "datafusion-sql", - "flate2", - "futures", - "hex", - "itertools", - "log", - 
"object_store", - "parking_lot", - "parquet", - "rand", - "regex", - "sqlparser", - "tempfile", - "tokio", - "url", - "uuid", - "xz2", - "zstd", -] - -[[package]] -name = "datafusion-catalog" -version = "49.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2848fd1e85e2953116dab9cc2eb109214b0888d7bbd2230e30c07f1794f642c0" -dependencies = [ - "arrow", - "async-trait", - "dashmap", - "datafusion-common", - "datafusion-common-runtime", - "datafusion-datasource", - "datafusion-execution", - "datafusion-expr", - "datafusion-physical-expr", - "datafusion-physical-plan", - "datafusion-session", - "datafusion-sql", - "futures", - "itertools", - "log", - "object_store", - "parking_lot", - "tokio", -] - -[[package]] -name = "datafusion-catalog-listing" -version = "49.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "051a1634628c2d1296d4e326823e7536640d87a118966cdaff069b68821ad53b" -dependencies = [ - "arrow", - "async-trait", - "datafusion-catalog", - "datafusion-common", - "datafusion-datasource", - "datafusion-execution", - "datafusion-expr", - "datafusion-physical-expr", - "datafusion-physical-expr-common", - "datafusion-physical-plan", - "datafusion-session", - "futures", - "log", - "object_store", - "tokio", -] - -[[package]] -name = "datafusion-common" -version = "49.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "765e4ad4ef7a4500e389a3f1e738791b71ff4c29fd00912c2f541d62b25da096" -dependencies = [ - "ahash", - "arrow", - "arrow-ipc", - "base64", - "chrono", - "half", - "hashbrown 0.14.5", - "hex", - "indexmap", - "libc", - "log", - "object_store", - "parquet", - "paste", - "recursive", - "sqlparser", - "tokio", - "web-time", -] - -[[package]] -name = "datafusion-common-runtime" -version = "49.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "40a2ae8393051ce25d232a6065c4558ab5a535c9637d5373bacfd464ac88ea12" -dependencies = [ - 
"futures", - "log", - "tokio", -] - -[[package]] -name = "datafusion-datasource" -version = "49.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "90cd841a77f378bc1a5c4a1c37345e1885a9203b008203f9f4b3a769729bf330" -dependencies = [ - "arrow", - "async-compression", - "async-trait", - "bytes", - "bzip2 0.6.0", - "chrono", - "datafusion-common", - "datafusion-common-runtime", - "datafusion-execution", - "datafusion-expr", - "datafusion-physical-expr", - "datafusion-physical-expr-common", - "datafusion-physical-plan", - "datafusion-session", - "flate2", - "futures", - "glob", - "itertools", - "log", - "object_store", - "parquet", - "rand", - "tempfile", - "tokio", - "tokio-util", - "url", - "xz2", - "zstd", -] - -[[package]] -name = "datafusion-datasource-csv" -version = "49.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "77f4a2c64939c6f0dd15b246723a699fa30d59d0133eb36a86e8ff8c6e2a8dc6" -dependencies = [ - "arrow", - "async-trait", - "bytes", - "datafusion-catalog", - "datafusion-common", - "datafusion-common-runtime", - "datafusion-datasource", - "datafusion-execution", - "datafusion-expr", - "datafusion-physical-expr", - "datafusion-physical-expr-common", - "datafusion-physical-plan", - "datafusion-session", - "futures", - "object_store", - "regex", - "tokio", -] - -[[package]] -name = "datafusion-datasource-json" -version = "49.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "11387aaf931b2993ad9273c63ddca33f05aef7d02df9b70fb757429b4b71cdae" -dependencies = [ - "arrow", - "async-trait", - "bytes", - "datafusion-catalog", - "datafusion-common", - "datafusion-common-runtime", - "datafusion-datasource", - "datafusion-execution", - "datafusion-expr", - "datafusion-physical-expr", - "datafusion-physical-expr-common", - "datafusion-physical-plan", - "datafusion-session", - "futures", - "object_store", - "serde_json", - "tokio", -] - -[[package]] -name = 
"datafusion-datasource-parquet" -version = "49.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "028f430c5185120bf806347848b8d8acd9823f4038875b3820eeefa35f2bb4a2" -dependencies = [ - "arrow", - "async-trait", - "bytes", - "datafusion-catalog", - "datafusion-common", - "datafusion-common-runtime", - "datafusion-datasource", - "datafusion-execution", - "datafusion-expr", - "datafusion-functions-aggregate", - "datafusion-physical-expr", - "datafusion-physical-expr-common", - "datafusion-physical-optimizer", - "datafusion-physical-plan", - "datafusion-pruning", - "datafusion-session", - "futures", - "hex", - "itertools", - "log", - "object_store", - "parking_lot", - "parquet", - "rand", - "tokio", -] - -[[package]] -name = "datafusion-doc" -version = "49.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8ff336d1d755399753a9e4fbab001180e346fc8bfa063a97f1214b82274c00f8" - -[[package]] -name = "datafusion-execution" -version = "49.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "042ea192757d1b2d7dcf71643e7ff33f6542c7704f00228d8b85b40003fd8e0f" -dependencies = [ - "arrow", - "dashmap", - "datafusion-common", - "datafusion-expr", - "futures", - "log", - "object_store", - "parking_lot", - "rand", - "tempfile", - "url", -] - -[[package]] -name = "datafusion-expr" -version = "49.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "025222545d6d7fab71e2ae2b356526a1df67a2872222cbae7535e557a42abd2e" -dependencies = [ - "arrow", - "async-trait", - "chrono", - "datafusion-common", - "datafusion-doc", - "datafusion-expr-common", - "datafusion-functions-aggregate-common", - "datafusion-functions-window-common", - "datafusion-physical-expr-common", - "indexmap", - "paste", - "recursive", - "serde_json", - "sqlparser", -] - -[[package]] -name = "datafusion-expr-common" -version = "49.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "9d5c267104849d5fa6d81cf5ba88f35ecd58727729c5eb84066c25227b644ae2" -dependencies = [ - "arrow", - "datafusion-common", - "indexmap", - "itertools", - "paste", -] - -[[package]] -name = "datafusion-ffi" -version = "49.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec21805d9df2d834e4c6ddfbf8a1bed2bd460b89b01686fe0dcd1cee06d0b60f" -dependencies = [ - "abi_stable", - "arrow", - "arrow-schema", - "async-ffi", - "async-trait", - "datafusion", - "datafusion-functions-aggregate-common", - "datafusion-proto", - "datafusion-proto-common", - "futures", - "log", - "prost", - "semver", - "tokio", -] - -[[package]] -name = "datafusion-ffi-example" -version = "0.2.0" -dependencies = [ - "arrow", - "arrow-array", - "arrow-schema", - "async-trait", - "datafusion", - "datafusion-ffi", - "pyo3", - "pyo3-build-config", -] - -[[package]] -name = "datafusion-functions" -version = "49.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c620d105aa208fcee45c588765483314eb415f5571cfd6c1bae3a59c5b4d15bb" -dependencies = [ - "arrow", - "arrow-buffer", - "base64", - "blake2", - "blake3", - "chrono", - "datafusion-common", - "datafusion-doc", - "datafusion-execution", - "datafusion-expr", - "datafusion-expr-common", - "datafusion-macros", - "hex", - "itertools", - "log", - "md-5", - "rand", - "regex", - "sha2", - "unicode-segmentation", - "uuid", -] - -[[package]] -name = "datafusion-functions-aggregate" -version = "49.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "35f61d5198a35ed368bf3aacac74f0d0fa33de7a7cb0c57e9f68ab1346d2f952" -dependencies = [ - "ahash", - "arrow", - "datafusion-common", - "datafusion-doc", - "datafusion-execution", - "datafusion-expr", - "datafusion-functions-aggregate-common", - "datafusion-macros", - "datafusion-physical-expr", - "datafusion-physical-expr-common", - "half", - "log", - "paste", -] - -[[package]] -name = 
"datafusion-functions-aggregate-common" -version = "49.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "13efdb17362be39b5024f6da0d977ffe49c0212929ec36eec550e07e2bc7812f" -dependencies = [ - "ahash", - "arrow", - "datafusion-common", - "datafusion-expr-common", - "datafusion-physical-expr-common", -] - -[[package]] -name = "datafusion-functions-nested" -version = "49.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9187678af567d7c9e004b72a0b6dc5b0a00ebf4901cb3511ed2db4effe092e66" -dependencies = [ - "arrow", - "arrow-ord", - "datafusion-common", - "datafusion-doc", - "datafusion-execution", - "datafusion-expr", - "datafusion-functions", - "datafusion-functions-aggregate", - "datafusion-functions-aggregate-common", - "datafusion-macros", - "datafusion-physical-expr-common", - "itertools", - "log", - "paste", -] - -[[package]] -name = "datafusion-functions-table" -version = "49.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ecf156589cc21ef59fe39c7a9a841b4a97394549643bbfa88cc44e8588cf8fe5" -dependencies = [ - "arrow", - "async-trait", - "datafusion-catalog", - "datafusion-common", - "datafusion-expr", - "datafusion-physical-plan", - "parking_lot", - "paste", -] - -[[package]] -name = "datafusion-functions-window" -version = "49.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "edcb25e3e369f1366ec9a261456e45b5aad6ea1c0c8b4ce546587207c501ed9e" -dependencies = [ - "arrow", - "datafusion-common", - "datafusion-doc", - "datafusion-expr", - "datafusion-functions-window-common", - "datafusion-macros", - "datafusion-physical-expr", - "datafusion-physical-expr-common", - "log", - "paste", -] - -[[package]] -name = "datafusion-functions-window-common" -version = "49.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8996a8e11174d0bd7c62dc2f316485affc6ae5ffd5b8a68b508137ace2310294" -dependencies = [ - 
"datafusion-common", - "datafusion-physical-expr-common", -] - -[[package]] -name = "datafusion-macros" -version = "49.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95ee8d1be549eb7316f437035f2cec7ec42aba8374096d807c4de006a3b5d78a" -dependencies = [ - "datafusion-expr", - "quote", - "syn 2.0.106", -] - -[[package]] -name = "datafusion-optimizer" -version = "49.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c9fa98671458254928af854e5f6c915e66b860a8bde505baea0ff2892deab74d" -dependencies = [ - "arrow", - "chrono", - "datafusion-common", - "datafusion-expr", - "datafusion-expr-common", - "datafusion-physical-expr", - "indexmap", - "itertools", - "log", - "recursive", - "regex", - "regex-syntax", -] - -[[package]] -name = "datafusion-physical-expr" -version = "49.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3515d51531cca5f7b5a6f3ea22742b71bb36fc378b465df124ff9a2fa349b002" -dependencies = [ - "ahash", - "arrow", - "datafusion-common", - "datafusion-expr", - "datafusion-expr-common", - "datafusion-functions-aggregate-common", - "datafusion-physical-expr-common", - "half", - "hashbrown 0.14.5", - "indexmap", - "itertools", - "log", - "paste", - "petgraph", -] - -[[package]] -name = "datafusion-physical-expr-common" -version = "49.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "24485475d9c618a1d33b2a3dad003d946dc7a7bbf0354d125301abc0a5a79e3e" -dependencies = [ - "ahash", - "arrow", - "datafusion-common", - "datafusion-expr-common", - "hashbrown 0.14.5", - "itertools", -] - -[[package]] -name = "datafusion-physical-optimizer" -version = "49.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b9da411a0a64702f941a12af2b979434d14ec5d36c6f49296966b2c7639cbb3a" -dependencies = [ - "arrow", - "datafusion-common", - "datafusion-execution", - "datafusion-expr", - "datafusion-expr-common", - 
"datafusion-physical-expr", - "datafusion-physical-expr-common", - "datafusion-physical-plan", - "datafusion-pruning", - "itertools", - "log", - "recursive", -] - -[[package]] -name = "datafusion-physical-plan" -version = "49.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a6d168282bb7b54880bb3159f89b51c047db4287f5014d60c3ef4c6e1468212b" -dependencies = [ - "ahash", - "arrow", - "arrow-ord", - "arrow-schema", - "async-trait", - "chrono", - "datafusion-common", - "datafusion-common-runtime", - "datafusion-execution", - "datafusion-expr", - "datafusion-functions-window-common", - "datafusion-physical-expr", - "datafusion-physical-expr-common", - "futures", - "half", - "hashbrown 0.14.5", - "indexmap", - "itertools", - "log", - "parking_lot", - "pin-project-lite", - "tokio", -] - -[[package]] -name = "datafusion-proto" -version = "49.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1b36a0c84f4500efd90487a004b533bd81de1f2bb3f143f71b7526f33b85d2e2" -dependencies = [ - "arrow", - "chrono", - "datafusion", - "datafusion-common", - "datafusion-expr", - "datafusion-proto-common", - "object_store", - "prost", -] - -[[package]] -name = "datafusion-proto-common" -version = "49.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2ec788be522806740ad6372c0a2f7e45fb37cb37f786d9b77933add49cdd058f" -dependencies = [ - "arrow", - "datafusion-common", - "prost", -] - -[[package]] -name = "datafusion-pruning" -version = "49.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "391a457b9d23744c53eeb89edd1027424cba100581488d89800ed841182df905" -dependencies = [ - "arrow", - "arrow-schema", - "datafusion-common", - "datafusion-datasource", - "datafusion-expr-common", - "datafusion-physical-expr", - "datafusion-physical-expr-common", - "datafusion-physical-plan", - "itertools", - "log", -] - -[[package]] -name = "datafusion-session" -version = "49.0.2" 
-source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "053201c2bb729c7938f85879034df2b5a52cfaba16f1b3b66ab8505c81b2aad3" -dependencies = [ - "arrow", - "async-trait", - "dashmap", - "datafusion-common", - "datafusion-common-runtime", - "datafusion-execution", - "datafusion-expr", - "datafusion-physical-expr", - "datafusion-physical-plan", - "datafusion-sql", - "futures", - "itertools", - "log", - "object_store", - "parking_lot", - "tokio", -] - -[[package]] -name = "datafusion-sql" -version = "49.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9082779be8ce4882189b229c0cff4393bd0808282a7194130c9f32159f185e25" -dependencies = [ - "arrow", - "bigdecimal", - "datafusion-common", - "datafusion-expr", - "indexmap", - "log", - "recursive", - "regex", - "sqlparser", -] - -[[package]] -name = "digest" -version = "0.10.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" -dependencies = [ - "block-buffer", - "crypto-common", - "subtle", -] - -[[package]] -name = "displaydoc" -version = "0.2.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.106", -] - -[[package]] -name = "either" -version = "1.15.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" - -[[package]] -name = "equivalent" -version = "1.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" - -[[package]] -name = "errno" -version = "0.3.13" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "778e2ac28f6c47af28e4907f13ffd1e1ddbd400980a9abd7c8df189bf578a5ad" -dependencies = [ - "libc", - 
"windows-sys 0.60.2", -] - -[[package]] -name = "fastrand" -version = "2.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" - -[[package]] -name = "fixedbitset" -version = "0.5.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d674e81391d1e1ab681a28d99df07927c6d4aa5b027d7da16ba32d1d21ecd99" - -[[package]] -name = "flatbuffers" -version = "25.2.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1045398c1bfd89168b5fd3f1fc11f6e70b34f6f66300c87d44d3de849463abf1" -dependencies = [ - "bitflags", - "rustc_version", -] - -[[package]] -name = "flate2" -version = "1.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4a3d7db9596fecd151c5f638c0ee5d5bd487b6e0ea232e5dc96d5250f6f94b1d" -dependencies = [ - "crc32fast", - "libz-rs-sys", - "miniz_oxide", -] - -[[package]] -name = "fnv" -version = "1.0.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" - -[[package]] -name = "foldhash" -version = "0.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2" - -[[package]] -name = "form_urlencoded" -version = "1.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cb4cb245038516f5f85277875cdaa4f7d2c9a0fa0468de06ed190163b1581fcf" -dependencies = [ - "percent-encoding", -] - -[[package]] -name = "futures" -version = "0.3.31" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "65bc07b1a8bc7c85c5f2e110c476c7389b4554ba72af57d8445ea63a576b0876" -dependencies = [ - "futures-channel", - "futures-core", - "futures-executor", - "futures-io", - "futures-sink", - "futures-task", - "futures-util", -] - -[[package]] -name = "futures-channel" -version = "0.3.31" 
-source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2dff15bf788c671c1934e366d07e30c1814a8ef514e1af724a602e8a2fbe1b10" -dependencies = [ - "futures-core", - "futures-sink", -] - -[[package]] -name = "futures-core" -version = "0.3.31" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05f29059c0c2090612e8d742178b0580d2dc940c837851ad723096f87af6663e" - -[[package]] -name = "futures-executor" -version = "0.3.31" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e28d1d997f585e54aebc3f97d39e72338912123a67330d723fdbb564d646c9f" -dependencies = [ - "futures-core", - "futures-task", - "futures-util", -] - -[[package]] -name = "futures-io" -version = "0.3.31" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e5c1b78ca4aae1ac06c48a526a655760685149f0d465d21f37abfe57ce075c6" - -[[package]] -name = "futures-macro" -version = "0.3.31" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.106", -] - -[[package]] -name = "futures-sink" -version = "0.3.31" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e575fab7d1e0dcb8d0c7bcf9a63ee213816ab51902e6d244a95819acacf1d4f7" - -[[package]] -name = "futures-task" -version = "0.3.31" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f90f7dce0722e95104fcb095585910c0977252f286e354b5e3bd38902cd99988" - -[[package]] -name = "futures-util" -version = "0.3.31" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9fa08315bb612088cc391249efdc3bc77536f16c91f6cf495e6fbe85b20a4a81" -dependencies = [ - "futures-channel", - "futures-core", - "futures-io", - "futures-macro", - "futures-sink", - "futures-task", - "memchr", - "pin-project-lite", - "pin-utils", - "slab", -] - -[[package]] -name = 
"generational-arena" -version = "0.2.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "877e94aff08e743b651baaea359664321055749b398adff8740a7399af7796e7" -dependencies = [ - "cfg-if", -] - -[[package]] -name = "generic-array" -version = "0.14.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" -dependencies = [ - "typenum", - "version_check", -] - -[[package]] -name = "getrandom" -version = "0.2.16" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "335ff9f135e4384c8150d6f27c6daed433577f86b4750418338c01a1a2528592" -dependencies = [ - "cfg-if", - "js-sys", - "libc", - "wasi 0.11.1+wasi-snapshot-preview1", - "wasm-bindgen", -] - -[[package]] -name = "getrandom" -version = "0.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "26145e563e54f2cadc477553f1ec5ee650b00862f0a58bcd12cbdc5f0ea2d2f4" -dependencies = [ - "cfg-if", - "libc", - "r-efi", - "wasi 0.14.2+wasi-0.2.4", -] - -[[package]] -name = "gimli" -version = "0.31.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "07e28edb80900c19c28f1072f2e8aeca7fa06b23cd4169cefe1af5aa3260783f" - -[[package]] -name = "glob" -version = "0.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0cc23270f6e1808e30a928bdc84dea0b9b4136a8bc82338574f23baf47bbd280" - -[[package]] -name = "half" -version = "2.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "459196ed295495a68f7d7fe1d84f6c4b7ff0e21fe3017b2f283c6fac3ad803c9" -dependencies = [ - "cfg-if", - "crunchy", - "num-traits", -] - -[[package]] -name = "hashbrown" -version = "0.14.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" -dependencies = [ - "ahash", - "allocator-api2", -] - -[[package]] -name = 
"hashbrown" -version = "0.15.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1" -dependencies = [ - "allocator-api2", - "equivalent", - "foldhash", -] - -[[package]] -name = "heck" -version = "0.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" - -[[package]] -name = "hex" -version = "0.4.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" - -[[package]] -name = "http" -version = "1.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f4a85d31aea989eead29a3aaf9e1115a180df8282431156e533de47660892565" -dependencies = [ - "bytes", - "fnv", - "itoa", -] - -[[package]] -name = "humantime" -version = "2.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b112acc8b3adf4b107a8ec20977da0273a8c386765a3ec0229bd500a1443f9f" - -[[package]] -name = "iana-time-zone" -version = "0.1.63" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b0c919e5debc312ad217002b8048a17b7d83f80703865bbfcfebb0458b0b27d8" -dependencies = [ - "android_system_properties", - "core-foundation-sys", - "iana-time-zone-haiku", - "js-sys", - "log", - "wasm-bindgen", - "windows-core", -] - -[[package]] -name = "iana-time-zone-haiku" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f" -dependencies = [ - "cc", -] - -[[package]] -name = "icu_collections" -version = "2.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "200072f5d0e3614556f94a9930d5dc3e0662a652823904c3a75dc3b0af7fee47" -dependencies = [ - "displaydoc", - "potential_utf", - "yoke", - "zerofrom", - "zerovec", -] - -[[package]] 
-name = "icu_locale_core" -version = "2.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0cde2700ccaed3872079a65fb1a78f6c0a36c91570f28755dda67bc8f7d9f00a" -dependencies = [ - "displaydoc", - "litemap", - "tinystr", - "writeable", - "zerovec", -] - -[[package]] -name = "icu_normalizer" -version = "2.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "436880e8e18df4d7bbc06d58432329d6458cc84531f7ac5f024e93deadb37979" -dependencies = [ - "displaydoc", - "icu_collections", - "icu_normalizer_data", - "icu_properties", - "icu_provider", - "smallvec", - "zerovec", -] - -[[package]] -name = "icu_normalizer_data" -version = "2.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "00210d6893afc98edb752b664b8890f0ef174c8adbb8d0be9710fa66fbbf72d3" - -[[package]] -name = "icu_properties" -version = "2.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "016c619c1eeb94efb86809b015c58f479963de65bdb6253345c1a1276f22e32b" -dependencies = [ - "displaydoc", - "icu_collections", - "icu_locale_core", - "icu_properties_data", - "icu_provider", - "potential_utf", - "zerotrie", - "zerovec", -] - -[[package]] -name = "icu_properties_data" -version = "2.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "298459143998310acd25ffe6810ed544932242d3f07083eee1084d83a71bd632" - -[[package]] -name = "icu_provider" -version = "2.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "03c80da27b5f4187909049ee2d72f276f0d9f99a42c306bd0131ecfe04d8e5af" -dependencies = [ - "displaydoc", - "icu_locale_core", - "stable_deref_trait", - "tinystr", - "writeable", - "yoke", - "zerofrom", - "zerotrie", - "zerovec", -] - -[[package]] -name = "idna" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3b0875f23caa03898994f6ddc501886a45c7d3d62d04d2d90788d47be1b1e4de" -dependencies = [ - 
"idna_adapter", - "smallvec", - "utf8_iter", -] - -[[package]] -name = "idna_adapter" -version = "1.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3acae9609540aa318d1bc588455225fb2085b9ed0c4f6bd0d9d5bcd86f1a0344" -dependencies = [ - "icu_normalizer", - "icu_properties", -] - -[[package]] -name = "indexmap" -version = "2.11.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2481980430f9f78649238835720ddccc57e52df14ffce1c6f37391d61b563e9" -dependencies = [ - "equivalent", - "hashbrown 0.15.5", -] - -[[package]] -name = "indoc" -version = "2.0.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f4c7245a08504955605670dbf141fceab975f15ca21570696aebe9d2e71576bd" - -[[package]] -name = "integer-encoding" -version = "3.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8bb03732005da905c88227371639bf1ad885cc712789c011c31c5fb3ab3ccf02" - -[[package]] -name = "io-uring" -version = "0.7.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "046fa2d4d00aea763528b4950358d0ead425372445dc8ff86312b3c69ff7727b" -dependencies = [ - "bitflags", - "cfg-if", - "libc", -] - -[[package]] -name = "itertools" -version = "0.14.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2b192c782037fadd9cfa75548310488aabdbf3d2da73885b31bd0abd03351285" -dependencies = [ - "either", -] - -[[package]] -name = "itoa" -version = "1.0.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c" - -[[package]] -name = "jobserver" -version = "0.1.34" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9afb3de4395d6b3e67a780b6de64b51c978ecf11cb9a462c66be7d4ca9039d33" -dependencies = [ - "getrandom 0.3.3", - "libc", -] - -[[package]] -name = "js-sys" -version = "0.3.77" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "1cfaf33c695fc6e08064efbc1f72ec937429614f25eef83af942d0e227c3a28f" -dependencies = [ - "once_cell", - "wasm-bindgen", -] - -[[package]] -name = "lexical-core" -version = "1.0.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b765c31809609075565a70b4b71402281283aeda7ecaf4818ac14a7b2ade8958" -dependencies = [ - "lexical-parse-float", - "lexical-parse-integer", - "lexical-util", - "lexical-write-float", - "lexical-write-integer", -] - -[[package]] -name = "lexical-parse-float" -version = "1.0.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "de6f9cb01fb0b08060209a057c048fcbab8717b4c1ecd2eac66ebfe39a65b0f2" -dependencies = [ - "lexical-parse-integer", - "lexical-util", - "static_assertions", -] - -[[package]] -name = "lexical-parse-integer" -version = "1.0.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72207aae22fc0a121ba7b6d479e42cbfea549af1479c3f3a4f12c70dd66df12e" -dependencies = [ - "lexical-util", - "static_assertions", -] - -[[package]] -name = "lexical-util" -version = "1.0.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a82e24bf537fd24c177ffbbdc6ebcc8d54732c35b50a3f28cc3f4e4c949a0b3" -dependencies = [ - "static_assertions", -] - -[[package]] -name = "lexical-write-float" -version = "1.0.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c5afc668a27f460fb45a81a757b6bf2f43c2d7e30cb5a2dcd3abf294c78d62bd" -dependencies = [ - "lexical-util", - "lexical-write-integer", - "static_assertions", -] - -[[package]] -name = "lexical-write-integer" -version = "1.0.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "629ddff1a914a836fb245616a7888b62903aae58fa771e1d83943035efa0f978" -dependencies = [ - "lexical-util", - "static_assertions", -] - -[[package]] -name = "libbz2-rs-sys" -version = "0.2.2" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c4a545a15244c7d945065b5d392b2d2d7f21526fba56ce51467b06ed445e8f7" - -[[package]] -name = "libc" -version = "0.2.175" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a82ae493e598baaea5209805c49bbf2ea7de956d50d7da0da1164f9c6d28543" - -[[package]] -name = "libloading" -version = "0.7.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b67380fd3b2fbe7527a606e18729d21c6f3951633d0500574c4dc22d2d638b9f" -dependencies = [ - "cfg-if", - "winapi", -] - -[[package]] -name = "libm" -version = "0.2.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f9fbbcab51052fe104eb5e5d351cf728d30a5be1fe14d9be8a3b097481fb97de" - -[[package]] -name = "libz-rs-sys" -version = "0.5.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "172a788537a2221661b480fee8dc5f96c580eb34fa88764d3205dc356c7e4221" -dependencies = [ - "zlib-rs", -] - -[[package]] -name = "linux-raw-sys" -version = "0.9.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cd945864f07fe9f5371a27ad7b52a172b4b499999f1d97574c9fa68373937e12" - -[[package]] -name = "litemap" -version = "0.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "241eaef5fd12c88705a01fc1066c48c4b36e0dd4377dcdc7ec3942cea7a69956" - -[[package]] -name = "lock_api" -version = "0.4.13" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "96936507f153605bddfcda068dd804796c84324ed2510809e5b2a624c81da765" -dependencies = [ - "autocfg", - "scopeguard", -] - -[[package]] -name = "log" -version = "0.4.27" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "13dc2df351e3202783a1fe0d44375f7295ffb4049267b0f3018346dc122a1d94" - -[[package]] -name = "lz4_flex" -version = "0.11.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"08ab2867e3eeeca90e844d1940eab391c9dc5228783db2ed999acbc0a9ed375a" -dependencies = [ - "twox-hash", -] - -[[package]] -name = "lzma-sys" -version = "0.1.20" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5fda04ab3764e6cde78b9974eec4f779acaba7c4e84b36eca3cf77c581b85d27" -dependencies = [ - "cc", - "libc", - "pkg-config", -] - -[[package]] -name = "md-5" -version = "0.10.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d89e7ee0cfbedfc4da3340218492196241d89eefb6dab27de5df917a6d2e78cf" -dependencies = [ - "cfg-if", - "digest", -] - -[[package]] -name = "memchr" -version = "2.7.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32a282da65faaf38286cf3be983213fcf1d2e2a58700e808f83f4ea9a4804bc0" - -[[package]] -name = "memoffset" -version = "0.9.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "488016bfae457b036d996092f6cb448677611ce4449e970ceaf42695203f218a" -dependencies = [ - "autocfg", -] - -[[package]] -name = "miniz_oxide" -version = "0.8.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1fa76a2c86f704bdb222d66965fb3d63269ce38518b83cb0575fca855ebb6316" -dependencies = [ - "adler2", -] - -[[package]] -name = "mio" -version = "1.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "78bed444cc8a2160f01cbcf811ef18cac863ad68ae8ca62092e8db51d51c761c" -dependencies = [ - "libc", - "wasi 0.11.1+wasi-snapshot-preview1", - "windows-sys 0.59.0", -] - -[[package]] -name = "num" -version = "0.4.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "35bd024e8b2ff75562e5f34e7f4905839deb4b22955ef5e73d2fea1b9813cb23" -dependencies = [ - "num-bigint", - "num-complex", - "num-integer", - "num-iter", - "num-rational", - "num-traits", -] - -[[package]] -name = "num-bigint" -version = "0.4.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"a5e44f723f1133c9deac646763579fdb3ac745e418f2a7af9cd0c431da1f20b9" -dependencies = [ - "num-integer", - "num-traits", -] - -[[package]] -name = "num-complex" -version = "0.4.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "73f88a1307638156682bada9d7604135552957b7818057dcef22705b4d509495" -dependencies = [ - "num-traits", -] - -[[package]] -name = "num-integer" -version = "0.1.46" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f" -dependencies = [ - "num-traits", -] - -[[package]] -name = "num-iter" -version = "0.1.45" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1429034a0490724d0075ebb2bc9e875d6503c3cf69e235a8941aa757d83ef5bf" -dependencies = [ - "autocfg", - "num-integer", - "num-traits", -] - -[[package]] -name = "num-rational" -version = "0.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f83d14da390562dca69fc84082e73e548e1ad308d24accdedd2720017cb37824" -dependencies = [ - "num-bigint", - "num-integer", - "num-traits", -] - -[[package]] -name = "num-traits" -version = "0.2.19" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" -dependencies = [ - "autocfg", - "libm", -] - -[[package]] -name = "object" -version = "0.36.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "62948e14d923ea95ea2c7c86c71013138b66525b86bdc08d2dcc262bdb497b87" -dependencies = [ - "memchr", -] - -[[package]] -name = "object_store" -version = "0.12.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "efc4f07659e11cd45a341cd24d71e683e3be65d9ff1f8150061678fe60437496" -dependencies = [ - "async-trait", - "bytes", - "chrono", - "futures", - "http", - "humantime", - "itertools", - "parking_lot", - "percent-encoding", - "thiserror", - "tokio", - 
"tracing", - "url", - "walkdir", - "wasm-bindgen-futures", - "web-time", -] - -[[package]] -name = "once_cell" -version = "1.21.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" - -[[package]] -name = "ordered-float" -version = "2.10.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "68f19d67e5a2795c94e73e0bb1cc1a7edeb2e28efd39e2e1c9b7a40c1108b11c" -dependencies = [ - "num-traits", -] - -[[package]] -name = "parking_lot" -version = "0.12.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "70d58bf43669b5795d1576d0641cfb6fbb2057bf629506267a92807158584a13" -dependencies = [ - "lock_api", - "parking_lot_core", -] - -[[package]] -name = "parking_lot_core" -version = "0.9.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc838d2a56b5b1a6c25f55575dfc605fabb63bb2365f6c2353ef9159aa69e4a5" -dependencies = [ - "cfg-if", - "libc", - "redox_syscall", - "smallvec", - "windows-targets 0.52.6", -] - -[[package]] -name = "parquet" -version = "55.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b17da4150748086bd43352bc77372efa9b6e3dbd06a04831d2a98c041c225cfa" -dependencies = [ - "ahash", - "arrow-array", - "arrow-buffer", - "arrow-cast", - "arrow-data", - "arrow-ipc", - "arrow-schema", - "arrow-select", - "base64", - "brotli", - "bytes", - "chrono", - "flate2", - "futures", - "half", - "hashbrown 0.15.5", - "lz4_flex", - "num", - "num-bigint", - "object_store", - "paste", - "ring", - "seq-macro", - "simdutf8", - "snap", - "thrift", - "tokio", - "twox-hash", - "zstd", -] - -[[package]] -name = "paste" -version = "1.0.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" - -[[package]] -name = "percent-encoding" -version = "2.3.2" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b4f627cb1b25917193a259e49bdad08f671f8d9708acfd5fe0a8c1455d87220" - -[[package]] -name = "petgraph" -version = "0.8.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "54acf3a685220b533e437e264e4d932cfbdc4cc7ec0cd232ed73c08d03b8a7ca" -dependencies = [ - "fixedbitset", - "hashbrown 0.15.5", - "indexmap", - "serde", -] - -[[package]] -name = "phf" -version = "0.12.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "913273894cec178f401a31ec4b656318d95473527be05c0752cc41cdc32be8b7" -dependencies = [ - "phf_shared", -] - -[[package]] -name = "phf_shared" -version = "0.12.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "06005508882fb681fd97892ecff4b7fd0fee13ef1aa569f8695dae7ab9099981" -dependencies = [ - "siphasher", -] - -[[package]] -name = "pin-project-lite" -version = "0.2.16" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3b3cff922bd51709b605d9ead9aa71031d81447142d828eb4a6eba76fe619f9b" - -[[package]] -name = "pin-utils" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" - -[[package]] -name = "pkg-config" -version = "0.3.32" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c" - -[[package]] -name = "portable-atomic" -version = "1.11.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f84267b20a16ea918e43c6a88433c2d54fa145c92a811b5b047ccbe153674483" - -[[package]] -name = "potential_utf" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e5a7c30837279ca13e7c867e9e40053bc68740f988cb07f7ca6df43cc734b585" -dependencies = [ - "zerovec", -] - -[[package]] -name = "ppv-lite86" -version = "0.2.21" -source 
= "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85eae3c4ed2f50dcfe72643da4befc30deadb458a9b590d720cde2f2b1e97da9" -dependencies = [ - "zerocopy", -] - -[[package]] -name = "proc-macro2" -version = "1.0.101" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89ae43fd86e4158d6db51ad8e2b80f313af9cc74f5c0e03ccb87de09998732de" -dependencies = [ - "unicode-ident", -] - -[[package]] -name = "prost" -version = "0.13.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2796faa41db3ec313a31f7624d9286acf277b52de526150b7e69f3debf891ee5" -dependencies = [ - "bytes", - "prost-derive", -] - -[[package]] -name = "prost-derive" -version = "0.13.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a56d757972c98b346a9b766e3f02746cde6dd1cd1d1d563472929fdd74bec4d" -dependencies = [ - "anyhow", - "itertools", - "proc-macro2", - "quote", - "syn 2.0.106", -] - -[[package]] -name = "psm" -version = "0.1.26" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6e944464ec8536cd1beb0bbfd96987eb5e3b72f2ecdafdc5c769a37f1fa2ae1f" -dependencies = [ - "cc", -] - -[[package]] -name = "pyo3" -version = "0.23.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7778bffd85cf38175ac1f545509665d0b9b92a198ca7941f131f85f7a4f9a872" -dependencies = [ - "cfg-if", - "indoc", - "libc", - "memoffset", - "once_cell", - "portable-atomic", - "pyo3-build-config", - "pyo3-ffi", - "pyo3-macros", - "unindent", -] - -[[package]] -name = "pyo3-build-config" -version = "0.23.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "94f6cbe86ef3bf18998d9df6e0f3fc1050a8c5efa409bf712e661a4366e010fb" -dependencies = [ - "once_cell", - "target-lexicon", -] - -[[package]] -name = "pyo3-ffi" -version = "0.23.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"e9f1b4c431c0bb1c8fb0a338709859eed0d030ff6daa34368d3b152a63dfdd8d" -dependencies = [ - "libc", - "pyo3-build-config", -] - -[[package]] -name = "pyo3-macros" -version = "0.23.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fbc2201328f63c4710f68abdf653c89d8dbc2858b88c5d88b0ff38a75288a9da" -dependencies = [ - "proc-macro2", - "pyo3-macros-backend", - "quote", - "syn 2.0.106", -] - -[[package]] -name = "pyo3-macros-backend" -version = "0.23.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fca6726ad0f3da9c9de093d6f116a93c1a38e417ed73bf138472cf4064f72028" -dependencies = [ - "heck", - "proc-macro2", - "pyo3-build-config", - "quote", - "syn 2.0.106", -] - -[[package]] -name = "quote" -version = "1.0.40" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1885c039570dc00dcb4ff087a89e185fd56bae234ddc7f056a945bf36467248d" -dependencies = [ - "proc-macro2", -] - -[[package]] -name = "r-efi" -version = "5.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f" - -[[package]] -name = "rand" -version = "0.9.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6db2770f06117d490610c7488547d543617b21bfa07796d7a12f6f1bd53850d1" -dependencies = [ - "rand_chacha", - "rand_core", -] - -[[package]] -name = "rand_chacha" -version = "0.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb" -dependencies = [ - "ppv-lite86", - "rand_core", -] - -[[package]] -name = "rand_core" -version = "0.9.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "99d9a13982dcf210057a8a78572b2217b667c3beacbf3a0d8b454f6f82837d38" -dependencies = [ - "getrandom 0.3.3", -] - -[[package]] -name = "recursive" -version = "0.1.1" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "0786a43debb760f491b1bc0269fe5e84155353c67482b9e60d0cfb596054b43e" -dependencies = [ - "recursive-proc-macro-impl", - "stacker", -] - -[[package]] -name = "recursive-proc-macro-impl" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "76009fbe0614077fc1a2ce255e3a1881a2e3a3527097d5dc6d8212c585e7e38b" -dependencies = [ - "quote", - "syn 2.0.106", -] - -[[package]] -name = "redox_syscall" -version = "0.5.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5407465600fb0548f1442edf71dd20683c6ed326200ace4b1ef0763521bb3b77" -dependencies = [ - "bitflags", -] - -[[package]] -name = "regex" -version = "1.11.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "23d7fd106d8c02486a8d64e778353d1cffe08ce79ac2e82f540c86d0facf6912" -dependencies = [ - "aho-corasick", - "memchr", - "regex-automata", - "regex-syntax", -] - -[[package]] -name = "regex-automata" -version = "0.4.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6b9458fa0bfeeac22b5ca447c63aaf45f28439a709ccd244698632f9aa6394d6" -dependencies = [ - "aho-corasick", - "memchr", - "regex-syntax", -] - -[[package]] -name = "regex-syntax" -version = "0.8.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "caf4aa5b0f434c91fe5c7f1ecb6a5ece2130b02ad2a590589dda5146df959001" - -[[package]] -name = "repr_offset" -version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fb1070755bd29dffc19d0971cab794e607839ba2ef4b69a9e6fbc8733c1b72ea" -dependencies = [ - "tstr", -] - -[[package]] -name = "ring" -version = "0.17.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a4689e6c2294d81e88dc6261c768b63bc4fcdb852be6d1352498b114f61383b7" -dependencies = [ - "cc", - "cfg-if", - "getrandom 0.2.16", - "libc", - "untrusted", - "windows-sys 
0.52.0", -] - -[[package]] -name = "rustc-demangle" -version = "0.1.26" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "56f7d92ca342cea22a06f2121d944b4fd82af56988c270852495420f961d4ace" - -[[package]] -name = "rustc_version" -version = "0.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cfcb3a22ef46e85b45de6ee7e79d063319ebb6594faafcf1c225ea92ab6e9b92" -dependencies = [ - "semver", -] - -[[package]] -name = "rustix" -version = "1.0.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "11181fbabf243db407ef8df94a6ce0b2f9a733bd8be4ad02b4eda9602296cac8" -dependencies = [ - "bitflags", - "errno", - "libc", - "linux-raw-sys", - "windows-sys 0.60.2", -] - -[[package]] -name = "rustversion" -version = "1.0.22" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" - -[[package]] -name = "ryu" -version = "1.0.20" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f" - -[[package]] -name = "same-file" -version = "1.0.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" -dependencies = [ - "winapi-util", -] - -[[package]] -name = "scopeguard" -version = "1.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" - -[[package]] -name = "semver" -version = "1.0.26" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "56e6fa9c48d24d85fb3de5ad847117517440f6beceb7798af16b4a87d616b8d0" - -[[package]] -name = "seq-macro" -version = "0.3.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1bc711410fbe7399f390ca1c3b60ad0f53f80e95c5eb935e52268a0e2cd49acc" - -[[package]] -name = 
"serde" -version = "1.0.219" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f0e2c6ed6606019b4e29e69dbaba95b11854410e5347d525002456dbbb786b6" -dependencies = [ - "serde_derive", -] - -[[package]] -name = "serde_derive" -version = "1.0.219" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b0276cf7f2c73365f7157c8123c21cd9a50fbbd844757af28ca1f5925fc2a00" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.106", -] - -[[package]] -name = "serde_json" -version = "1.0.143" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d401abef1d108fbd9cbaebc3e46611f4b1021f714a0597a71f41ee463f5f4a5a" -dependencies = [ - "itoa", - "memchr", - "ryu", - "serde", -] - -[[package]] -name = "sha2" -version = "0.10.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a7507d819769d01a365ab707794a4084392c824f54a7a6a7862f8c3d0892b283" -dependencies = [ - "cfg-if", - "cpufeatures", - "digest", -] - -[[package]] -name = "shlex" -version = "1.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" - -[[package]] -name = "simdutf8" -version = "0.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e3a9fe34e3e7a50316060351f37187a3f546bce95496156754b601a5fa71b76e" - -[[package]] -name = "siphasher" -version = "1.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "56199f7ddabf13fe5074ce809e7d3f42b42ae711800501b5b16ea82ad029c39d" - -[[package]] -name = "slab" -version = "0.4.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a2ae44ef20feb57a68b23d846850f861394c2e02dc425a50098ae8c90267589" - -[[package]] -name = "smallvec" -version = "1.15.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03" - 
-[[package]] -name = "snap" -version = "1.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1b6b67fb9a61334225b5b790716f609cd58395f895b3fe8b328786812a40bc3b" - -[[package]] -name = "sqlparser" -version = "0.55.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c4521174166bac1ff04fe16ef4524c70144cd29682a45978978ca3d7f4e0be11" -dependencies = [ - "log", - "recursive", - "sqlparser_derive", -] - -[[package]] -name = "sqlparser_derive" -version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da5fc6819faabb412da764b99d3b713bb55083c11e7e0c00144d386cd6a1939c" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.106", -] - -[[package]] -name = "stable_deref_trait" -version = "1.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" - -[[package]] -name = "stacker" -version = "0.1.21" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cddb07e32ddb770749da91081d8d0ac3a16f1a569a18b20348cd371f5dead06b" -dependencies = [ - "cc", - "cfg-if", - "libc", - "psm", - "windows-sys 0.59.0", -] - -[[package]] -name = "static_assertions" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" - -[[package]] -name = "subtle" -version = "2.6.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" - -[[package]] -name = "syn" -version = "1.0.109" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" -dependencies = [ - "proc-macro2", - "quote", - "unicode-ident", -] - -[[package]] -name = "syn" -version = "2.0.106" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "ede7c438028d4436d71104916910f5bb611972c5cfd7f89b8300a8186e6fada6" -dependencies = [ - "proc-macro2", - "quote", - "unicode-ident", -] - -[[package]] -name = "synstructure" -version = "0.13.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "728a70f3dbaf5bab7f0c4b1ac8d7ae5ea60a4b5549c8a5914361c99147a709d2" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.106", -] - -[[package]] -name = "target-lexicon" -version = "0.12.16" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "61c41af27dd6d1e27b1b16b489db798443478cef1f06a660c96db617ba5de3b1" - -[[package]] -name = "tempfile" -version = "3.21.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "15b61f8f20e3a6f7e0649d825294eaf317edce30f82cf6026e7e4cb9222a7d1e" -dependencies = [ - "fastrand", - "getrandom 0.3.3", - "once_cell", - "rustix", - "windows-sys 0.60.2", -] - -[[package]] -name = "thiserror" -version = "2.0.16" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3467d614147380f2e4e374161426ff399c91084acd2363eaf549172b3d5e60c0" -dependencies = [ - "thiserror-impl", -] - -[[package]] -name = "thiserror-impl" -version = "2.0.16" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c5e1be1c48b9172ee610da68fd9cd2770e7a4056cb3fc98710ee6906f0c7960" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.106", -] - -[[package]] -name = "thrift" -version = "0.17.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7e54bc85fc7faa8bc175c4bab5b92ba8d9a3ce893d0e9f42cc455c8ab16a9e09" -dependencies = [ - "byteorder", - "integer-encoding", - "ordered-float", -] - -[[package]] -name = "tiny-keccak" -version = "2.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c9d3793400a45f954c52e73d068316d76b6f4e36977e3fcebb13a2721e80237" -dependencies = [ - 
"crunchy", -] - -[[package]] -name = "tinystr" -version = "0.8.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5d4f6d1145dcb577acf783d4e601bc1d76a13337bb54e6233add580b07344c8b" -dependencies = [ - "displaydoc", - "zerovec", -] - -[[package]] -name = "tokio" -version = "1.47.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89e49afdadebb872d3145a5638b59eb0691ea23e46ca484037cfab3b76b95038" -dependencies = [ - "backtrace", - "bytes", - "io-uring", - "libc", - "mio", - "pin-project-lite", - "slab", - "tokio-macros", -] - -[[package]] -name = "tokio-macros" -version = "2.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6e06d43f1345a3bcd39f6a56dbb7dcab2ba47e68e8ac134855e7e2bdbaf8cab8" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.106", -] - -[[package]] -name = "tokio-util" -version = "0.7.16" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "14307c986784f72ef81c89db7d9e28d6ac26d16213b109ea501696195e6e3ce5" -dependencies = [ - "bytes", - "futures-core", - "futures-sink", - "pin-project-lite", - "tokio", -] - -[[package]] -name = "tracing" -version = "0.1.41" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "784e0ac535deb450455cbfa28a6f0df145ea1bb7ae51b821cf5e7927fdcfbdd0" -dependencies = [ - "pin-project-lite", - "tracing-attributes", - "tracing-core", -] - -[[package]] -name = "tracing-attributes" -version = "0.1.30" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "81383ab64e72a7a8b8e13130c49e3dab29def6d0c7d76a03087b3cf71c5c6903" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.106", -] - -[[package]] -name = "tracing-core" -version = "0.1.34" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b9d12581f227e93f094d3af2ae690a574abb8a2b9b7a96e7cfe9647b2b617678" -dependencies = [ - "once_cell", -] - -[[package]] -name = "tstr" -version = 
"0.2.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f8e0294f14baae476d0dd0a2d780b2e24d66e349a9de876f5126777a37bdba7" -dependencies = [ - "tstr_proc_macros", -] - -[[package]] -name = "tstr_proc_macros" -version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e78122066b0cb818b8afd08f7ed22f7fdbc3e90815035726f0840d0d26c0747a" - -[[package]] -name = "twox-hash" -version = "2.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b907da542cbced5261bd3256de1b3a1bf340a3d37f93425a07362a1d687de56" - -[[package]] -name = "typed-arena" -version = "2.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6af6ae20167a9ece4bcb41af5b80f8a1f1df981f6391189ce00fd257af04126a" - -[[package]] -name = "typenum" -version = "1.18.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1dccffe3ce07af9386bfd29e80c0ab1a8205a2fc34e4bcd40364df902cfa8f3f" - -[[package]] -name = "typewit" -version = "1.13.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4dd91acc53c592cb800c11c83e8e7ee1d48378d05cfa33b5474f5f80c5b236bf" - -[[package]] -name = "unicode-ident" -version = "1.0.18" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512" - -[[package]] -name = "unicode-segmentation" -version = "1.12.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f6ccf251212114b54433ec949fd6a7841275f9ada20dddd2f29e9ceea4501493" - -[[package]] -name = "unicode-width" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4a1a07cc7db3810833284e8d372ccdc6da29741639ecc70c9ec107df0fa6154c" - -[[package]] -name = "unindent" -version = "0.2.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"7264e107f553ccae879d21fbea1d6724ac785e8c3bfc762137959b5802826ef3" - -[[package]] -name = "untrusted" -version = "0.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" - -[[package]] -name = "url" -version = "2.5.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08bc136a29a3d1758e07a9cca267be308aeebf5cfd5a10f3f67ab2097683ef5b" -dependencies = [ - "form_urlencoded", - "idna", - "percent-encoding", - "serde", -] - -[[package]] -name = "utf8_iter" -version = "1.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be" - -[[package]] -name = "uuid" -version = "1.18.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f33196643e165781c20a5ead5582283a7dacbb87855d867fbc2df3f81eddc1be" -dependencies = [ - "getrandom 0.3.3", - "js-sys", - "wasm-bindgen", -] - -[[package]] -name = "version_check" -version = "0.9.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" - -[[package]] -name = "walkdir" -version = "2.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b" -dependencies = [ - "same-file", - "winapi-util", -] - -[[package]] -name = "wasi" -version = "0.11.1+wasi-snapshot-preview1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b" - -[[package]] -name = "wasi" -version = "0.14.2+wasi-0.2.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9683f9a5a998d873c0d21fcbe3c083009670149a8fab228644b8bd36b2c48cb3" -dependencies = [ - "wit-bindgen-rt", -] - -[[package]] -name = "wasm-bindgen" -version = "0.2.100" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "1edc8929d7499fc4e8f0be2262a241556cfc54a0bea223790e71446f2aab1ef5" -dependencies = [ - "cfg-if", - "once_cell", - "rustversion", - "wasm-bindgen-macro", -] - -[[package]] -name = "wasm-bindgen-backend" -version = "0.2.100" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2f0a0651a5c2bc21487bde11ee802ccaf4c51935d0d3d42a6101f98161700bc6" -dependencies = [ - "bumpalo", - "log", - "proc-macro2", - "quote", - "syn 2.0.106", - "wasm-bindgen-shared", -] - -[[package]] -name = "wasm-bindgen-futures" -version = "0.4.50" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "555d470ec0bc3bb57890405e5d4322cc9ea83cebb085523ced7be4144dac1e61" -dependencies = [ - "cfg-if", - "js-sys", - "once_cell", - "wasm-bindgen", - "web-sys", -] - -[[package]] -name = "wasm-bindgen-macro" -version = "0.2.100" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7fe63fc6d09ed3792bd0897b314f53de8e16568c2b3f7982f468c0bf9bd0b407" -dependencies = [ - "quote", - "wasm-bindgen-macro-support", -] - -[[package]] -name = "wasm-bindgen-macro-support" -version = "0.2.100" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8ae87ea40c9f689fc23f209965b6fb8a99ad69aeeb0231408be24920604395de" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.106", - "wasm-bindgen-backend", - "wasm-bindgen-shared", -] - -[[package]] -name = "wasm-bindgen-shared" -version = "0.2.100" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a05d73b933a847d6cccdda8f838a22ff101ad9bf93e33684f39c1f5f0eece3d" -dependencies = [ - "unicode-ident", -] - -[[package]] -name = "web-sys" -version = "0.3.77" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "33b6dd2ef9186f1f2072e409e99cd22a975331a6b3591b12c764e0e55c60d5d2" -dependencies = [ - "js-sys", - "wasm-bindgen", -] - -[[package]] -name = "web-time" 
-version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a6580f308b1fad9207618087a65c04e7a10bc77e02c8e84e9b00dd4b12fa0bb" -dependencies = [ - "js-sys", - "wasm-bindgen", -] - -[[package]] -name = "winapi" -version = "0.3.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" -dependencies = [ - "winapi-i686-pc-windows-gnu", - "winapi-x86_64-pc-windows-gnu", -] - -[[package]] -name = "winapi-i686-pc-windows-gnu" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" - -[[package]] -name = "winapi-util" -version = "0.1.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0978bf7171b3d90bac376700cb56d606feb40f251a475a5d6634613564460b22" -dependencies = [ - "windows-sys 0.60.2", -] - -[[package]] -name = "winapi-x86_64-pc-windows-gnu" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" - -[[package]] -name = "windows-core" -version = "0.61.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c0fdd3ddb90610c7638aa2b3a3ab2904fb9e5cdbecc643ddb3647212781c4ae3" -dependencies = [ - "windows-implement", - "windows-interface", - "windows-link", - "windows-result", - "windows-strings", -] - -[[package]] -name = "windows-implement" -version = "0.60.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a47fddd13af08290e67f4acabf4b459f647552718f683a7b415d290ac744a836" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.106", -] - -[[package]] -name = "windows-interface" -version = "0.59.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bd9211b69f8dcdfa817bfd14bf1c97c9188afa36f4750130fcdf3f400eca9fa8" 
-dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.106", -] - -[[package]] -name = "windows-link" -version = "0.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5e6ad25900d524eaabdbbb96d20b4311e1e7ae1699af4fb28c17ae66c80d798a" - -[[package]] -name = "windows-result" -version = "0.3.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "56f42bd332cc6c8eac5af113fc0c1fd6a8fd2aa08a0119358686e5160d0586c6" -dependencies = [ - "windows-link", -] - -[[package]] -name = "windows-strings" -version = "0.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "56e6c93f3a0c3b36176cb1327a4958a0353d5d166c2a35cb268ace15e91d3b57" -dependencies = [ - "windows-link", -] - -[[package]] -name = "windows-sys" -version = "0.52.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" -dependencies = [ - "windows-targets 0.52.6", -] - -[[package]] -name = "windows-sys" -version = "0.59.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" -dependencies = [ - "windows-targets 0.52.6", -] - -[[package]] -name = "windows-sys" -version = "0.60.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2f500e4d28234f72040990ec9d39e3a6b950f9f22d3dba18416c35882612bcb" -dependencies = [ - "windows-targets 0.53.3", -] - -[[package]] -name = "windows-targets" -version = "0.52.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" -dependencies = [ - "windows_aarch64_gnullvm 0.52.6", - "windows_aarch64_msvc 0.52.6", - "windows_i686_gnu 0.52.6", - "windows_i686_gnullvm 0.52.6", - "windows_i686_msvc 0.52.6", - "windows_x86_64_gnu 0.52.6", - "windows_x86_64_gnullvm 0.52.6", - "windows_x86_64_msvc 0.52.6", -] - 
-[[package]] -name = "windows-targets" -version = "0.53.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d5fe6031c4041849d7c496a8ded650796e7b6ecc19df1a431c1a363342e5dc91" -dependencies = [ - "windows-link", - "windows_aarch64_gnullvm 0.53.0", - "windows_aarch64_msvc 0.53.0", - "windows_i686_gnu 0.53.0", - "windows_i686_gnullvm 0.53.0", - "windows_i686_msvc 0.53.0", - "windows_x86_64_gnu 0.53.0", - "windows_x86_64_gnullvm 0.53.0", - "windows_x86_64_msvc 0.53.0", -] - -[[package]] -name = "windows_aarch64_gnullvm" -version = "0.52.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" - -[[package]] -name = "windows_aarch64_gnullvm" -version = "0.53.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "86b8d5f90ddd19cb4a147a5fa63ca848db3df085e25fee3cc10b39b6eebae764" - -[[package]] -name = "windows_aarch64_msvc" -version = "0.52.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" - -[[package]] -name = "windows_aarch64_msvc" -version = "0.53.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c7651a1f62a11b8cbd5e0d42526e55f2c99886c77e007179efff86c2b137e66c" - -[[package]] -name = "windows_i686_gnu" -version = "0.52.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" - -[[package]] -name = "windows_i686_gnu" -version = "0.53.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c1dc67659d35f387f5f6c479dc4e28f1d4bb90ddd1a5d3da2e5d97b42d6272c3" - -[[package]] -name = "windows_i686_gnullvm" -version = "0.52.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" - -[[package]] -name = 
"windows_i686_gnullvm" -version = "0.53.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ce6ccbdedbf6d6354471319e781c0dfef054c81fbc7cf83f338a4296c0cae11" - -[[package]] -name = "windows_i686_msvc" -version = "0.52.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" - -[[package]] -name = "windows_i686_msvc" -version = "0.53.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "581fee95406bb13382d2f65cd4a908ca7b1e4c2f1917f143ba16efe98a589b5d" - -[[package]] -name = "windows_x86_64_gnu" -version = "0.52.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" - -[[package]] -name = "windows_x86_64_gnu" -version = "0.53.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2e55b5ac9ea33f2fc1716d1742db15574fd6fc8dadc51caab1c16a3d3b4190ba" - -[[package]] -name = "windows_x86_64_gnullvm" -version = "0.52.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" - -[[package]] -name = "windows_x86_64_gnullvm" -version = "0.53.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0a6e035dd0599267ce1ee132e51c27dd29437f63325753051e71dd9e42406c57" - -[[package]] -name = "windows_x86_64_msvc" -version = "0.52.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" - -[[package]] -name = "windows_x86_64_msvc" -version = "0.53.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "271414315aff87387382ec3d271b52d7ae78726f5d44ac98b4f4030c91880486" - -[[package]] -name = "wit-bindgen-rt" -version = "0.39.0" -source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "6f42320e61fe2cfd34354ecb597f86f413484a798ba44a8ca1165c58d42da6c1" -dependencies = [ - "bitflags", -] - -[[package]] -name = "writeable" -version = "0.6.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ea2f10b9bb0928dfb1b42b65e1f9e36f7f54dbdf08457afefb38afcdec4fa2bb" - -[[package]] -name = "xz2" -version = "0.1.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "388c44dc09d76f1536602ead6d325eb532f5c122f17782bd57fb47baeeb767e2" -dependencies = [ - "lzma-sys", -] - -[[package]] -name = "yoke" -version = "0.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f41bb01b8226ef4bfd589436a297c53d118f65921786300e427be8d487695cc" -dependencies = [ - "serde", - "stable_deref_trait", - "yoke-derive", - "zerofrom", -] - -[[package]] -name = "yoke-derive" -version = "0.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38da3c9736e16c5d3c8c597a9aaa5d1fa565d0532ae05e27c24aa62fb32c0ab6" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.106", - "synstructure", -] - -[[package]] -name = "zerocopy" -version = "0.8.26" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1039dd0d3c310cf05de012d8a39ff557cb0d23087fd44cad61df08fc31907a2f" -dependencies = [ - "zerocopy-derive", -] - -[[package]] -name = "zerocopy-derive" -version = "0.8.26" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ecf5b4cc5364572d7f4c329661bcc82724222973f2cab6f050a4e5c22f75181" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.106", -] - -[[package]] -name = "zerofrom" -version = "0.1.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "50cc42e0333e05660c3587f3bf9d0478688e15d870fab3346451ce7f8c9fbea5" -dependencies = [ - "zerofrom-derive", -] - -[[package]] -name = "zerofrom-derive" -version = "0.1.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"d71e5d6e06ab090c67b5e44993ec16b72dcbaabc526db883a360057678b48502" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.106", - "synstructure", -] - -[[package]] -name = "zerotrie" -version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "36f0bbd478583f79edad978b407914f61b2972f5af6fa089686016be8f9af595" -dependencies = [ - "displaydoc", - "yoke", - "zerofrom", -] - -[[package]] -name = "zerovec" -version = "0.11.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e7aa2bd55086f1ab526693ecbe444205da57e25f4489879da80635a46d90e73b" -dependencies = [ - "yoke", - "zerofrom", - "zerovec-derive", -] - -[[package]] -name = "zerovec-derive" -version = "0.11.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b96237efa0c878c64bd89c436f661be4e46b2f3eff1ebb976f7ef2321d2f58f" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.106", -] - -[[package]] -name = "zlib-rs" -version = "0.5.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "626bd9fa9734751fc50d6060752170984d7053f5a39061f524cda68023d4db8a" - -[[package]] -name = "zstd" -version = "0.13.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e91ee311a569c327171651566e07972200e76fcfe2242a4fa446149a3881c08a" -dependencies = [ - "zstd-safe", -] - -[[package]] -name = "zstd-safe" -version = "7.2.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f49c4d5f0abb602a93fb8736af2a4f4dd9512e36f7f570d66e65ff867ed3b9d" -dependencies = [ - "zstd-sys", -] - -[[package]] -name = "zstd-sys" -version = "2.0.15+zstd.1.5.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eb81183ddd97d0c74cedf1d50d85c8d08c1b8b68ee863bdee9e706eedba1a237" -dependencies = [ - "cc", - "pkg-config", -] diff --git a/examples/datafusion-ffi-example/Cargo.toml b/examples/datafusion-ffi-example/Cargo.toml index 647f6c51e..178dce9f9 100644 --- 
a/examples/datafusion-ffi-example/Cargo.toml +++ b/examples/datafusion-ffi-example/Cargo.toml @@ -17,20 +17,36 @@ [package] name = "datafusion-ffi-example" -version = "0.2.0" -edition = "2021" +version.workspace = true +edition.workspace = true +license.workspace = true +description.workspace = true +homepage.workspace = true +repository.workspace = true +publish = false [dependencies] -datafusion = { version = "49.0.2" } -datafusion-ffi = { version = "49.0.2" } -pyo3 = { version = "0.23", features = ["extension-module", "abi3", "abi3-py39"] } -arrow = { version = "55.0.0" } -arrow-array = { version = "55.0.0" } -arrow-schema = { version = "55.0.0" } -async-trait = "0.1.88" +datafusion-catalog = { workspace = true, default-features = false } +datafusion-common = { workspace = true, default-features = false } +datafusion-functions-aggregate = { workspace = true } +datafusion-functions-window = { workspace = true } +datafusion-expr = { workspace = true } +datafusion-ffi = { workspace = true } + +arrow = { workspace = true } +arrow-array = { workspace = true } +arrow-schema = { workspace = true } +async-trait = { workspace = true } +datafusion-python-util.workspace = true +pyo3 = { workspace = true, features = [ + "extension-module", + "abi3", + "abi3-py310", +] } +pyo3-log = { workspace = true } [build-dependencies] -pyo3-build-config = "0.23" +pyo3-build-config = { workspace = true } [lib] name = "datafusion_ffi_example" diff --git a/examples/datafusion-ffi-example/pyproject.toml b/examples/datafusion-ffi-example/pyproject.toml index 0c54df95c..7f85e9487 100644 --- a/examples/datafusion-ffi-example/pyproject.toml +++ b/examples/datafusion-ffi-example/pyproject.toml @@ -23,9 +23,9 @@ build-backend = "maturin" name = "datafusion_ffi_example" requires-python = ">=3.9" classifiers = [ - "Programming Language :: Rust", - "Programming Language :: Python :: Implementation :: CPython", - "Programming Language :: Python :: Implementation :: PyPy", + "Programming Language :: 
Rust", + "Programming Language :: Python :: Implementation :: CPython", + "Programming Language :: Python :: Implementation :: PyPy", ] dynamic = ["version"] diff --git a/examples/datafusion-ffi-example/python/tests/_test_catalog_provider.py b/examples/datafusion-ffi-example/python/tests/_test_catalog_provider.py index 1bf1bf136..a862b23ba 100644 --- a/examples/datafusion-ffi-example/python/tests/_test_catalog_provider.py +++ b/examples/datafusion-ffi-example/python/tests/_test_catalog_provider.py @@ -18,43 +18,119 @@ from __future__ import annotations import pyarrow as pa -from datafusion import SessionContext -from datafusion_ffi_example import MyCatalogProvider +import pyarrow.dataset as ds +import pytest +from datafusion import SessionContext, Table +from datafusion.catalog import Schema +from datafusion_ffi_example import MyCatalogProvider, MyCatalogProviderList -def test_catalog_provider(): +def create_test_dataset() -> Table: + """Create a simple test dataset.""" + batch = pa.RecordBatch.from_arrays( + [pa.array([100, 200, 300]), pa.array([1.1, 2.2, 3.3])], + names=["id", "value"], + ) + dataset = ds.dataset([batch]) + return Table(dataset) + + +@pytest.mark.parametrize("inner_capsule", [True, False]) +def test_ffi_catalog_provider_list(inner_capsule: bool) -> None: + """Test basic FFI CatalogProviderList functionality.""" ctx = SessionContext() - my_catalog_name = "my_catalog" - expected_schema_name = "my_schema" - expected_table_name = "my_table" - expected_table_columns = ["units", "price"] + # Register FFI catalog + catalog_provider_list = MyCatalogProviderList() + if inner_capsule: + catalog_provider_list = ( + catalog_provider_list.__datafusion_catalog_provider_list__(ctx) + ) + + ctx.register_catalog_provider_list(catalog_provider_list) + + # Verify the catalog exists + catalog = ctx.catalog("auto_ffi_catalog") + schema_names = catalog.names() + assert "my_schema" in schema_names + + ctx.register_catalog_provider("second", MyCatalogProvider()) + + 
assert ctx.catalog_names() == {"auto_ffi_catalog", "second"} + + +@pytest.mark.parametrize("inner_capsule", [True, False]) +def test_ffi_catalog_provider_basic(inner_capsule: bool) -> None: + """Test basic FFI CatalogProvider functionality.""" + ctx = SessionContext() + # Register FFI catalog catalog_provider = MyCatalogProvider() - ctx.register_catalog_provider(my_catalog_name, catalog_provider) - my_catalog = ctx.catalog(my_catalog_name) - - my_catalog_schemas = my_catalog.names() - assert expected_schema_name in my_catalog_schemas - my_schema = my_catalog.schema(expected_schema_name) - assert expected_table_name in my_schema.names() - my_table = my_schema.table(expected_table_name) - assert expected_table_columns == my_table.schema.names - - result = ctx.table( - f"{my_catalog_name}.{expected_schema_name}.{expected_table_name}" - ).collect() + if inner_capsule: + catalog_provider = catalog_provider.__datafusion_catalog_provider__(ctx) + + ctx.register_catalog_provider("ffi_catalog", catalog_provider) + + # Verify the catalog exists + catalog = ctx.catalog("ffi_catalog") + schema_names = catalog.names() + assert "my_schema" in schema_names + + # Query the pre-populated table + result = ctx.sql("SELECT * FROM ffi_catalog.my_schema.my_table").collect() assert len(result) == 2 + assert result[0].num_columns == 2 + + +def test_ffi_catalog_provider_register_schema(): + """Test registering additional schemas to FFI CatalogProvider.""" + ctx = SessionContext() + + catalog_provider = MyCatalogProvider() + ctx.register_catalog_provider("ffi_catalog", catalog_provider) + + catalog = ctx.catalog("ffi_catalog") + + # Register a new memory schema + new_schema = Schema.memory_schema() + catalog.register_schema("additional_schema", new_schema) + + # Verify the schema was registered + assert "additional_schema" in catalog.names() + + # Add a table to the new schema + new_schema.register_table("new_table", create_test_dataset()) + + # Query the new table + result = 
ctx.sql("SELECT * FROM ffi_catalog.additional_schema.new_table").collect() + assert len(result) == 1 + assert result[0].column(0) == pa.array([100, 200, 300]) + + +def test_ffi_catalog_provider_deregister_schema(): + """Test deregistering schemas from FFI CatalogProvider.""" + ctx = SessionContext() + + catalog_provider = MyCatalogProvider() + ctx.register_catalog_provider("ffi_catalog", catalog_provider) + + catalog = ctx.catalog("ffi_catalog") + + # Register two schemas + schema1 = Schema.memory_schema() + schema2 = Schema.memory_schema() + catalog.register_schema("temp_schema1", schema1) + catalog.register_schema("temp_schema2", schema2) + + # Verify both exist + names = catalog.names() + assert "temp_schema1" in names + assert "temp_schema2" in names + + # Deregister one schema + catalog.deregister_schema("temp_schema1") - col0_result = [r.column(0) for r in result] - col1_result = [r.column(1) for r in result] - expected_col0 = [ - pa.array([10, 20, 30], type=pa.int32()), - pa.array([5, 7], type=pa.int32()), - ] - expected_col1 = [ - pa.array([1, 2, 5], type=pa.float64()), - pa.array([1.5, 2.5], type=pa.float64()), - ] - assert col0_result == expected_col0 - assert col1_result == expected_col1 + # Verify it's gone + names = catalog.names() + assert "temp_schema1" not in names + assert "temp_schema2" in names diff --git a/examples/datafusion-ffi-example/python/tests/_test_schema_provider.py b/examples/datafusion-ffi-example/python/tests/_test_schema_provider.py new file mode 100644 index 000000000..93449c660 --- /dev/null +++ b/examples/datafusion-ffi-example/python/tests/_test_schema_provider.py @@ -0,0 +1,232 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from __future__ import annotations + +import pyarrow as pa +import pyarrow.dataset as ds +import pytest +from datafusion import SessionContext, Table +from datafusion.catalog import Schema +from datafusion_ffi_example import FixedSchemaProvider, MyCatalogProvider + + +def create_test_dataset() -> Table: + """Create a simple test dataset.""" + batch = pa.RecordBatch.from_arrays( + [pa.array([100, 200, 300]), pa.array([1.1, 2.2, 3.3])], + names=["id", "value"], + ) + dataset = ds.dataset([batch]) + return Table(dataset) + + +@pytest.mark.parametrize("inner_capsule", [True, False]) +def test_schema_provider_extract_values(inner_capsule: bool) -> None: + ctx = SessionContext() + + my_schema_name = "my_schema" + + schema_provider = FixedSchemaProvider() + if inner_capsule: + schema_provider = schema_provider.__datafusion_schema_provider__(ctx) + + ctx.catalog().register_schema(my_schema_name, schema_provider) + + expected_schema_name = "my_schema" + expected_table_name = "my_table" + expected_table_columns = ["units", "price"] + + default_catalog = ctx.catalog() + + catalog_schemas = default_catalog.names() + assert expected_schema_name in catalog_schemas + my_schema = default_catalog.schema(expected_schema_name) + assert expected_table_name in my_schema.names() + my_table = my_schema.table(expected_table_name) + assert expected_table_columns == my_table.schema.names + + result = 
ctx.table(f"{expected_schema_name}.{expected_table_name}").collect() + assert len(result) == 2 + + col0_result = [r.column(0) for r in result] + col1_result = [r.column(1) for r in result] + expected_col0 = [ + pa.array([10, 20, 30], type=pa.int32()), + pa.array([5, 7], type=pa.int32()), + ] + expected_col1 = [ + pa.array([1, 2, 5], type=pa.float64()), + pa.array([1.5, 2.5], type=pa.float64()), + ] + assert col0_result == expected_col0 + assert col1_result == expected_col1 + + +def test_ffi_schema_provider_basic(): + """Test basic FFI SchemaProvider functionality.""" + ctx = SessionContext() + + # Register FFI schema + schema_provider = FixedSchemaProvider() + ctx.catalog().register_schema("ffi_schema", schema_provider) + + # Verify the schema exists + schema = ctx.catalog().schema("ffi_schema") + table_names = schema.names() + assert "my_table" in table_names + + # Query the pre-populated table + result = ctx.sql("SELECT * FROM ffi_schema.my_table").collect() + assert len(result) == 2 + assert result[0].num_columns == 2 + + +def test_ffi_schema_provider_register_table(): + """Test registering additional tables to FFI SchemaProvider.""" + ctx = SessionContext() + + schema_provider = FixedSchemaProvider() + ctx.catalog().register_schema("ffi_schema", schema_provider) + + schema = ctx.catalog().schema("ffi_schema") + + # Register a new table + schema.register_table("additional_table", create_test_dataset()) + + # Verify the table was registered + assert "additional_table" in schema.names() + + # Query the new table + result = ctx.sql("SELECT * FROM ffi_schema.additional_table").collect() + assert len(result) == 1 + assert result[0].column(0) == pa.array([100, 200, 300]) + assert result[0].column(1) == pa.array([1.1, 2.2, 3.3]) + + +def test_ffi_schema_provider_deregister_table(): + """Test deregistering tables from FFI SchemaProvider.""" + ctx = SessionContext() + + schema_provider = FixedSchemaProvider() + ctx.catalog().register_schema("ffi_schema", schema_provider) 
+ + schema = ctx.catalog().schema("ffi_schema") + + # Register two tables + schema.register_table("temp_table1", create_test_dataset()) + schema.register_table("temp_table2", create_test_dataset()) + + # Verify both exist + names = schema.names() + assert "temp_table1" in names + assert "temp_table2" in names + + # Deregister one table + schema.deregister_table("temp_table1") + + # Verify it's gone + names = schema.names() + assert "temp_table1" not in names + assert "temp_table2" in names + + +def test_mixed_ffi_and_python_providers(): + """Test mixing FFI and Python providers in the same catalog/schema.""" + ctx = SessionContext() + + # Register FFI catalog + ffi_catalog = MyCatalogProvider() + ctx.register_catalog_provider("ffi_catalog", ffi_catalog) + + # Register Python memory schema to FFI catalog + python_schema = Schema.memory_schema() + ctx.catalog("ffi_catalog").register_schema("python_schema", python_schema) + + # Add table to Python schema + python_schema.register_table("python_table", create_test_dataset()) + + # Query both FFI table and Python table + result_ffi = ctx.sql("SELECT * FROM ffi_catalog.my_schema.my_table").collect() + assert len(result_ffi) == 2 + + result_python = ctx.sql( + "SELECT * FROM ffi_catalog.python_schema.python_table" + ).collect() + assert len(result_python) == 1 + assert result_python[0].column(0) == pa.array([100, 200, 300]) + + +def test_ffi_catalog_with_multiple_schemas(): + """Test FFI catalog with multiple schemas of different types.""" + ctx = SessionContext() + + catalog_provider = MyCatalogProvider() + ctx.register_catalog_provider("multi_catalog", catalog_provider) + + catalog = ctx.catalog("multi_catalog") + + # Register different types of schemas + ffi_schema = FixedSchemaProvider() + memory_schema = Schema.memory_schema() + + catalog.register_schema("ffi_schema", ffi_schema) + catalog.register_schema("memory_schema", memory_schema) + + # Add tables to memory schema + memory_schema.register_table("mem_table", 
create_test_dataset()) + + # Verify all schemas exist + names = catalog.names() + assert "my_schema" in names # Pre-populated + assert "ffi_schema" in names + assert "memory_schema" in names + + # Query tables from each schema + result = ctx.sql("SELECT * FROM multi_catalog.my_schema.my_table").collect() + assert len(result) == 2 + + result = ctx.sql("SELECT * FROM multi_catalog.ffi_schema.my_table").collect() + assert len(result) == 2 + + result = ctx.sql("SELECT * FROM multi_catalog.memory_schema.mem_table").collect() + assert len(result) == 1 + assert result[0].column(0) == pa.array([100, 200, 300]) + + +def test_ffi_schema_table_exist(): + """Test table_exist method on FFI SchemaProvider.""" + ctx = SessionContext() + + schema_provider = FixedSchemaProvider() + ctx.catalog().register_schema("ffi_schema", schema_provider) + + schema = ctx.catalog().schema("ffi_schema") + + # Check pre-populated table + assert schema.table_exist("my_table") + + # Check non-existent table + assert not schema.table_exist("nonexistent_table") + + # Register a new table and check + schema.register_table("new_table", create_test_dataset()) + assert schema.table_exist("new_table") + + # Deregister and check + schema.deregister_table("new_table") + assert not schema.table_exist("new_table") diff --git a/examples/datafusion-ffi-example/python/tests/_test_table_function.py b/examples/datafusion-ffi-example/python/tests/_test_table_function.py index 4b8b21454..bf5aae3bd 100644 --- a/examples/datafusion-ffi-example/python/tests/_test_table_function.py +++ b/examples/datafusion-ffi-example/python/tests/_test_table_function.py @@ -27,9 +27,10 @@ from datafusion.context import TableProviderExportable -def test_ffi_table_function_register(): +def test_ffi_table_function_register() -> None: ctx = SessionContext() table_func = MyTableFunction() + table_udtf = udtf(table_func, "my_table_func") ctx.register_udtf(table_udtf) result = ctx.sql("select * from my_table_func()").collect() diff --git 
a/examples/datafusion-ffi-example/python/tests/_test_table_provider.py b/examples/datafusion-ffi-example/python/tests/_test_table_provider.py index 48feaff64..fc77d2d3b 100644 --- a/examples/datafusion-ffi-example/python/tests/_test_table_provider.py +++ b/examples/datafusion-ffi-example/python/tests/_test_table_provider.py @@ -18,13 +18,18 @@ from __future__ import annotations import pyarrow as pa +import pytest from datafusion import SessionContext from datafusion_ffi_example import MyTableProvider -def test_table_loading(): +@pytest.mark.parametrize("inner_capsule", [True, False]) +def test_table_provider_ffi(inner_capsule: bool) -> None: ctx = SessionContext() table = MyTableProvider(3, 2, 4) + if inner_capsule: + table = table.__datafusion_table_provider__(ctx) + ctx.register_table("t", table) result = ctx.table("t").collect() diff --git a/examples/datafusion-ffi-example/python/tests/_test_table_provider_factory.py b/examples/datafusion-ffi-example/python/tests/_test_table_provider_factory.py new file mode 100644 index 000000000..b1e94ec73 --- /dev/null +++ b/examples/datafusion-ffi-example/python/tests/_test_table_provider_factory.py @@ -0,0 +1,41 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +from __future__ import annotations + +from datafusion import SessionContext +from datafusion_ffi_example import MyTableProviderFactory + + +def test_table_provider_factory_ffi() -> None: + ctx = SessionContext() + table = MyTableProviderFactory() + + ctx.register_table_factory("MY_FORMAT", table) + + # Create a new external table + ctx.sql(""" + CREATE EXTERNAL TABLE + foo + STORED AS my_format + LOCATION ''; + """).collect() + + # Query the pre-populated table + result = ctx.sql("SELECT * FROM foo;").collect() + assert len(result) == 2 + assert result[0].num_columns == 2 diff --git a/examples/datafusion-ffi-example/python/tests/conftest.py b/examples/datafusion-ffi-example/python/tests/conftest.py new file mode 100644 index 000000000..68f8057af --- /dev/null +++ b/examples/datafusion-ffi-example/python/tests/conftest.py @@ -0,0 +1,42 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +from __future__ import annotations + +import logging +from typing import TYPE_CHECKING + +import pytest + +if TYPE_CHECKING: + from collections.abc import Generator + from typing import Any + + +class _FailOnWarning(logging.Handler): + def emit(self, record: logging.LogRecord) -> None: + if record.levelno >= logging.WARNING: + err = f"Unexpected log warning from '{record.name}': {self.format(record)}" + raise AssertionError(err) + + +@pytest.fixture(autouse=True) +def fail_on_log_warnings() -> Generator[None, Any, None]: + handler = _FailOnWarning() + logging.root.addHandler(handler) + yield + logging.root.removeHandler(handler) diff --git a/examples/datafusion-ffi-example/src/aggregate_udf.rs b/examples/datafusion-ffi-example/src/aggregate_udf.rs index 9481fe9c6..d5343ff91 100644 --- a/examples/datafusion-ffi-example/src/aggregate_udf.rs +++ b/examples/datafusion-ffi-example/src/aggregate_udf.rs @@ -15,19 +15,25 @@ // specific language governing permissions and limitations // under the License. 
+use std::any::Any; +use std::sync::Arc; + use arrow_schema::DataType; -use datafusion::error::Result as DataFusionResult; -use datafusion::functions_aggregate::sum::Sum; -use datafusion::logical_expr::function::AccumulatorArgs; -use datafusion::logical_expr::{Accumulator, AggregateUDF, AggregateUDFImpl, Signature}; +use datafusion_common::error::Result as DataFusionResult; +use datafusion_expr::function::AccumulatorArgs; +use datafusion_expr::{Accumulator, AggregateUDF, AggregateUDFImpl, Signature}; use datafusion_ffi::udaf::FFI_AggregateUDF; +use datafusion_functions_aggregate::sum::Sum; use pyo3::types::PyCapsule; -use pyo3::{pyclass, pymethods, Bound, PyResult, Python}; -use std::any::Any; -use std::sync::Arc; +use pyo3::{Bound, PyResult, Python, pyclass, pymethods}; -#[pyclass(name = "MySumUDF", module = "datafusion_ffi_example", subclass)] -#[derive(Debug, Clone)] +#[pyclass( + from_py_object, + name = "MySumUDF", + module = "datafusion_ffi_example", + subclass +)] +#[derive(Debug, Clone, Eq, PartialEq, Hash)] pub(crate) struct MySumUDF { inner: Arc, } @@ -35,10 +41,10 @@ pub(crate) struct MySumUDF { #[pymethods] impl MySumUDF { #[new] - fn new() -> Self { - Self { + fn new() -> PyResult { + Ok(Self { inner: Arc::new(Sum::new()), - } + }) } fn __datafusion_aggregate_udf__<'py>( diff --git a/examples/datafusion-ffi-example/src/catalog_provider.rs b/examples/datafusion-ffi-example/src/catalog_provider.rs index cd2616916..bd5da1e4d 100644 --- a/examples/datafusion-ffi-example/src/catalog_provider.rs +++ b/examples/datafusion-ffi-example/src/catalog_provider.rs @@ -15,24 +15,27 @@ // specific language governing permissions and limitations // under the License. 
-use pyo3::{pyclass, pymethods, Bound, PyResult, Python}; -use std::{any::Any, fmt::Debug, sync::Arc}; +use std::any::Any; +use std::fmt::Debug; +use std::sync::Arc; use arrow::datatypes::Schema; use async_trait::async_trait; -use datafusion::{ - catalog::{ - CatalogProvider, MemoryCatalogProvider, MemorySchemaProvider, SchemaProvider, TableProvider, - }, - datasource::MemTable, - error::{DataFusionError, Result}, +use datafusion_catalog::{ + CatalogProvider, CatalogProviderList, MemTable, MemoryCatalogProvider, + MemoryCatalogProviderList, MemorySchemaProvider, SchemaProvider, TableProvider, }; +use datafusion_common::error::{DataFusionError, Result}; use datafusion_ffi::catalog_provider::FFI_CatalogProvider; +use datafusion_ffi::catalog_provider_list::FFI_CatalogProviderList; +use datafusion_ffi::schema_provider::FFI_SchemaProvider; +use datafusion_python_util::ffi_logical_codec_from_pycapsule; use pyo3::types::PyCapsule; +use pyo3::{Bound, PyAny, PyResult, Python, pyclass, pymethods}; pub fn my_table() -> Arc { use arrow::datatypes::{DataType, Field}; - use datafusion::common::record_batch; + use datafusion_common::record_batch; let schema = Arc::new(Schema::new(vec![ Field::new("units", DataType::Int32, true), @@ -55,14 +58,20 @@ pub fn my_table() -> Arc { Arc::new(MemTable::try_new(schema, vec![partitions]).unwrap()) } +#[pyclass( + skip_from_py_object, + name = "FixedSchemaProvider", + module = "datafusion_ffi_example", + subclass +)] #[derive(Debug)] pub struct FixedSchemaProvider { - inner: MemorySchemaProvider, + inner: Arc, } impl Default for FixedSchemaProvider { fn default() -> Self { - let inner = MemorySchemaProvider::new(); + let inner = Arc::new(MemorySchemaProvider::new()); let table = my_table(); @@ -72,6 +81,29 @@ impl Default for FixedSchemaProvider { } } +#[pymethods] +impl FixedSchemaProvider { + #[new] + pub fn new() -> Self { + Self::default() + } + + pub fn __datafusion_schema_provider__<'py>( + &self, + py: Python<'py>, + session: Bound, + 
) -> PyResult> { + let name = cr"datafusion_schema_provider".into(); + + let provider = Arc::clone(&self.inner) as Arc; + + let codec = ffi_logical_codec_from_pycapsule(session)?; + let provider = FFI_SchemaProvider::new_with_ffi_codec(provider, None, codec); + + PyCapsule::new(py, provider, Some(name)) + } +} + #[async_trait] impl SchemaProvider for FixedSchemaProvider { fn as_any(&self) -> &dyn Any { @@ -106,24 +138,14 @@ impl SchemaProvider for FixedSchemaProvider { /// This catalog provider is intended only for unit tests. It prepopulates with one /// schema and only allows for schemas named after four types of fruit. #[pyclass( + skip_from_py_object, name = "MyCatalogProvider", module = "datafusion_ffi_example", subclass )] -#[derive(Debug)] +#[derive(Debug, Clone)] pub(crate) struct MyCatalogProvider { - inner: MemoryCatalogProvider, -} - -impl Default for MyCatalogProvider { - fn default() -> Self { - let inner = MemoryCatalogProvider::new(); - - let schema_name: &str = "my_schema"; - let _ = inner.register_schema(schema_name, Arc::new(FixedSchemaProvider::default())); - - Self { inner } - } + inner: Arc, } impl CatalogProvider for MyCatalogProvider { @@ -159,20 +181,92 @@ impl CatalogProvider for MyCatalogProvider { #[pymethods] impl MyCatalogProvider { #[new] - pub fn new() -> Self { - Self { - inner: Default::default(), - } + pub fn new() -> PyResult { + let inner = Arc::new(MemoryCatalogProvider::new()); + + let schema_name: &str = "my_schema"; + let _ = inner.register_schema(schema_name, Arc::new(FixedSchemaProvider::default())); + + Ok(Self { inner }) } pub fn __datafusion_catalog_provider__<'py>( &self, py: Python<'py>, + session: Bound, ) -> PyResult> { let name = cr"datafusion_catalog_provider".into(); - let catalog_provider = - FFI_CatalogProvider::new(Arc::new(MyCatalogProvider::default()), None); - PyCapsule::new(py, catalog_provider, Some(name)) + let provider = Arc::clone(&self.inner) as Arc; + + let codec = 
ffi_logical_codec_from_pycapsule(session)?; + let provider = FFI_CatalogProvider::new_with_ffi_codec(provider, None, codec); + + PyCapsule::new(py, provider, Some(name)) + } +} + +/// This catalog provider list is intended only for unit tests. +/// It pre-populates with a single catalog. +#[pyclass( + skip_from_py_object, + name = "MyCatalogProviderList", + module = "datafusion_ffi_example", + subclass +)] +#[derive(Debug, Clone)] +pub(crate) struct MyCatalogProviderList { + inner: Arc, +} + +impl CatalogProviderList for MyCatalogProviderList { + fn as_any(&self) -> &dyn Any { + self + } + + fn catalog_names(&self) -> Vec { + self.inner.catalog_names() + } + + fn catalog(&self, name: &str) -> Option> { + self.inner.catalog(name) + } + + fn register_catalog( + &self, + name: String, + catalog: Arc, + ) -> Option> { + self.inner.register_catalog(name, catalog) + } +} + +#[pymethods] +impl MyCatalogProviderList { + #[new] + pub fn new() -> PyResult { + let inner = Arc::new(MemoryCatalogProviderList::new()); + + inner.register_catalog( + "auto_ffi_catalog".to_owned(), + Arc::new(MyCatalogProvider::new()?), + ); + + Ok(Self { inner }) + } + + pub fn __datafusion_catalog_provider_list__<'py>( + &self, + py: Python<'py>, + session: Bound, + ) -> PyResult> { + let name = cr"datafusion_catalog_provider_list".into(); + + let provider = Arc::clone(&self.inner) as Arc; + + let codec = ffi_logical_codec_from_pycapsule(session)?; + let provider = FFI_CatalogProviderList::new_with_ffi_codec(provider, None, codec); + + PyCapsule::new(py, provider, Some(name)) } } diff --git a/examples/datafusion-ffi-example/src/lib.rs b/examples/datafusion-ffi-example/src/lib.rs index f5f96cd49..68120a4cd 100644 --- a/examples/datafusion-ffi-example/src/lib.rs +++ b/examples/datafusion-ffi-example/src/lib.rs @@ -15,26 +15,34 @@ // specific language governing permissions and limitations // under the License. 
+use pyo3::prelude::*; + use crate::aggregate_udf::MySumUDF; -use crate::catalog_provider::MyCatalogProvider; +use crate::catalog_provider::{FixedSchemaProvider, MyCatalogProvider, MyCatalogProviderList}; use crate::scalar_udf::IsNullUDF; use crate::table_function::MyTableFunction; use crate::table_provider::MyTableProvider; +use crate::table_provider_factory::MyTableProviderFactory; use crate::window_udf::MyRankUDF; -use pyo3::prelude::*; pub(crate) mod aggregate_udf; pub(crate) mod catalog_provider; pub(crate) mod scalar_udf; pub(crate) mod table_function; pub(crate) mod table_provider; +pub(crate) mod table_provider_factory; pub(crate) mod window_udf; #[pymodule] fn datafusion_ffi_example(m: &Bound<'_, PyModule>) -> PyResult<()> { + pyo3_log::init(); + m.add_class::()?; + m.add_class::()?; m.add_class::()?; m.add_class::()?; + m.add_class::()?; + m.add_class::()?; m.add_class::()?; m.add_class::()?; m.add_class::()?; diff --git a/examples/datafusion-ffi-example/src/scalar_udf.rs b/examples/datafusion-ffi-example/src/scalar_udf.rs index 727666638..374924781 100644 --- a/examples/datafusion-ffi-example/src/scalar_udf.rs +++ b/examples/datafusion-ffi-example/src/scalar_udf.rs @@ -15,22 +15,28 @@ // specific language governing permissions and limitations // under the License. 
+use std::any::Any; +use std::sync::Arc; + use arrow_array::{Array, BooleanArray}; use arrow_schema::DataType; -use datafusion::common::ScalarValue; -use datafusion::error::Result as DataFusionResult; -use datafusion::logical_expr::{ +use datafusion_common::ScalarValue; +use datafusion_common::error::Result as DataFusionResult; +use datafusion_expr::{ ColumnarValue, ScalarFunctionArgs, ScalarUDF, ScalarUDFImpl, Signature, TypeSignature, Volatility, }; use datafusion_ffi::udf::FFI_ScalarUDF; use pyo3::types::PyCapsule; -use pyo3::{pyclass, pymethods, Bound, PyResult, Python}; -use std::any::Any; -use std::sync::Arc; +use pyo3::{Bound, PyResult, Python, pyclass, pymethods}; -#[pyclass(name = "IsNullUDF", module = "datafusion_ffi_example", subclass)] -#[derive(Debug, Clone)] +#[pyclass( + from_py_object, + name = "IsNullUDF", + module = "datafusion_ffi_example", + subclass +)] +#[derive(Debug, Clone, PartialEq, Eq, Hash)] pub(crate) struct IsNullUDF { signature: Signature, } diff --git a/examples/datafusion-ffi-example/src/table_function.rs b/examples/datafusion-ffi-example/src/table_function.rs index 2d7b356e3..79c13f64d 100644 --- a/examples/datafusion-ffi-example/src/table_function.rs +++ b/examples/datafusion-ffi-example/src/table_function.rs @@ -15,16 +15,24 @@ // specific language governing permissions and limitations // under the License. 
-use crate::table_provider::MyTableProvider; -use datafusion::catalog::{TableFunctionImpl, TableProvider}; -use datafusion::error::Result as DataFusionResult; -use datafusion::prelude::Expr; +use std::sync::Arc; + +use datafusion_catalog::{TableFunctionImpl, TableProvider}; +use datafusion_common::error::Result as DataFusionResult; +use datafusion_expr::Expr; use datafusion_ffi::udtf::FFI_TableFunction; +use datafusion_python_util::ffi_logical_codec_from_pycapsule; use pyo3::types::PyCapsule; -use pyo3::{pyclass, pymethods, Bound, PyResult, Python}; -use std::sync::Arc; +use pyo3::{Bound, PyAny, PyResult, Python, pyclass, pymethods}; + +use crate::table_provider::MyTableProvider; -#[pyclass(name = "MyTableFunction", module = "datafusion_ffi_example", subclass)] +#[pyclass( + from_py_object, + name = "MyTableFunction", + module = "datafusion_ffi_example", + subclass +)] #[derive(Debug, Clone)] pub(crate) struct MyTableFunction {} @@ -38,11 +46,13 @@ impl MyTableFunction { fn __datafusion_table_function__<'py>( &self, py: Python<'py>, + session: Bound, ) -> PyResult> { let name = cr"datafusion_table_function".into(); let func = self.clone(); - let provider = FFI_TableFunction::new(Arc::new(func), None); + let codec = ffi_logical_codec_from_pycapsule(session)?; + let provider = FFI_TableFunction::new_with_ffi_codec(Arc::new(func), None, codec); PyCapsule::new(py, provider, Some(name)) } diff --git a/examples/datafusion-ffi-example/src/table_provider.rs b/examples/datafusion-ffi-example/src/table_provider.rs index e884585b5..358ef7402 100644 --- a/examples/datafusion-ffi-example/src/table_provider.rs +++ b/examples/datafusion-ffi-example/src/table_provider.rs @@ -15,19 +15,26 @@ // specific language governing permissions and limitations // under the License. 
+use std::sync::Arc; + use arrow_array::{ArrayRef, RecordBatch}; use arrow_schema::{DataType, Field, Schema}; -use datafusion::catalog::MemTable; -use datafusion::error::{DataFusionError, Result as DataFusionResult}; +use datafusion_catalog::MemTable; +use datafusion_common::error::{DataFusionError, Result as DataFusionResult}; use datafusion_ffi::table_provider::FFI_TableProvider; +use datafusion_python_util::ffi_logical_codec_from_pycapsule; use pyo3::exceptions::PyRuntimeError; use pyo3::types::PyCapsule; -use pyo3::{pyclass, pymethods, Bound, PyResult, Python}; -use std::sync::Arc; +use pyo3::{Bound, PyAny, PyResult, Python, pyclass, pymethods}; /// In order to provide a test that demonstrates different sized record batches, /// the first batch will have num_rows, the second batch num_rows+1, and so on. -#[pyclass(name = "MyTableProvider", module = "datafusion_ffi_example", subclass)] +#[pyclass( + from_py_object, + name = "MyTableProvider", + module = "datafusion_ffi_example", + subclass +)] #[derive(Clone)] pub(crate) struct MyTableProvider { num_cols: usize, @@ -90,13 +97,17 @@ impl MyTableProvider { pub fn __datafusion_table_provider__<'py>( &self, py: Python<'py>, + session: Bound, ) -> PyResult> { let name = cr"datafusion_table_provider".into(); let provider = self .create_table() - .map_err(|e| PyRuntimeError::new_err(e.to_string()))?; - let provider = FFI_TableProvider::new(Arc::new(provider), false, None); + .map_err(|e: DataFusionError| PyRuntimeError::new_err(e.to_string()))?; + + let codec = ffi_logical_codec_from_pycapsule(session)?; + let provider = + FFI_TableProvider::new_with_ffi_codec(Arc::new(provider), false, None, codec); PyCapsule::new(py, provider, Some(name)) } diff --git a/examples/datafusion-ffi-example/src/table_provider_factory.rs b/examples/datafusion-ffi-example/src/table_provider_factory.rs new file mode 100644 index 000000000..53248a905 --- /dev/null +++ b/examples/datafusion-ffi-example/src/table_provider_factory.rs @@ -0,0 
+1,87 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::sync::Arc; + +use async_trait::async_trait; +use datafusion_catalog::{Session, TableProvider, TableProviderFactory}; +use datafusion_common::error::Result as DataFusionResult; +use datafusion_expr::CreateExternalTable; +use datafusion_ffi::table_provider_factory::FFI_TableProviderFactory; +use datafusion_python_util::ffi_logical_codec_from_pycapsule; +use pyo3::types::PyCapsule; +use pyo3::{Bound, PyAny, PyResult, Python, pyclass, pymethods}; + +use crate::catalog_provider; + +#[derive(Debug)] +pub(crate) struct ExampleTableProviderFactory {} + +impl ExampleTableProviderFactory { + fn new() -> Self { + Self {} + } +} + +#[async_trait] +impl TableProviderFactory for ExampleTableProviderFactory { + async fn create( + &self, + _state: &dyn Session, + _cmd: &CreateExternalTable, + ) -> DataFusionResult> { + Ok(catalog_provider::my_table()) + } +} + +#[pyclass( + name = "MyTableProviderFactory", + module = "datafusion_ffi_example", + subclass +)] +#[derive(Debug)] +pub struct MyTableProviderFactory { + inner: Arc, +} + +impl Default for MyTableProviderFactory { + fn default() -> Self { + let inner = Arc::new(ExampleTableProviderFactory::new()); + Self { 
inner } + } +} + +#[pymethods] +impl MyTableProviderFactory { + #[new] + pub fn new() -> Self { + Self::default() + } + + pub fn __datafusion_table_provider_factory__<'py>( + &self, + py: Python<'py>, + codec: Bound, + ) -> PyResult> { + let name = cr"datafusion_table_provider_factory".into(); + let codec = ffi_logical_codec_from_pycapsule(codec)?; + let factory = Arc::clone(&self.inner) as Arc; + let factory = FFI_TableProviderFactory::new_with_ffi_codec(factory, None, codec); + + PyCapsule::new(py, factory, Some(name)) + } +} diff --git a/examples/datafusion-ffi-example/src/window_udf.rs b/examples/datafusion-ffi-example/src/window_udf.rs index e0d397956..cbf179a86 100644 --- a/examples/datafusion-ffi-example/src/window_udf.rs +++ b/examples/datafusion-ffi-example/src/window_udf.rs @@ -15,19 +15,25 @@ // specific language governing permissions and limitations // under the License. +use std::any::Any; +use std::sync::Arc; + use arrow_schema::{DataType, FieldRef}; -use datafusion::error::Result as DataFusionResult; -use datafusion::functions_window::rank::rank_udwf; -use datafusion::logical_expr::function::{PartitionEvaluatorArgs, WindowUDFFieldArgs}; -use datafusion::logical_expr::{PartitionEvaluator, Signature, WindowUDF, WindowUDFImpl}; +use datafusion_common::error::Result as DataFusionResult; +use datafusion_expr::function::{PartitionEvaluatorArgs, WindowUDFFieldArgs}; +use datafusion_expr::{PartitionEvaluator, Signature, WindowUDF, WindowUDFImpl}; use datafusion_ffi::udwf::FFI_WindowUDF; +use datafusion_functions_window::rank::rank_udwf; use pyo3::types::PyCapsule; -use pyo3::{pyclass, pymethods, Bound, PyResult, Python}; -use std::any::Any; -use std::sync::Arc; +use pyo3::{Bound, PyResult, Python, pyclass, pymethods}; -#[pyclass(name = "MyRankUDF", module = "datafusion_ffi_example", subclass)] -#[derive(Debug, Clone)] +#[pyclass( + from_py_object, + name = "MyRankUDF", + module = "datafusion_ffi_example", + subclass +)] +#[derive(Debug, Clone, Eq, PartialEq, 
Hash)] pub(crate) struct MyRankUDF { inner: Arc, } @@ -35,8 +41,8 @@ pub(crate) struct MyRankUDF { #[pymethods] impl MyRankUDF { #[new] - fn new() -> Self { - Self { inner: rank_udwf() } + fn new() -> PyResult { + Ok(Self { inner: rank_udwf() }) } fn __datafusion_window_udf__<'py>(&self, py: Python<'py>) -> PyResult> { diff --git a/examples/python-udf-comparisons.py b/examples/python-udf-comparisons.py index eb0825011..b870645a3 100644 --- a/examples/python-udf-comparisons.py +++ b/examples/python-udf-comparisons.py @@ -15,16 +15,16 @@ # specific language governing permissions and limitations # under the License. -import os import time +from pathlib import Path import pyarrow as pa import pyarrow.compute as pc from datafusion import SessionContext, col, lit, udf from datafusion import functions as F -path = os.path.dirname(os.path.abspath(__file__)) -filepath = os.path.join(path, "./tpch/data/lineitem.parquet") +path = Path(__file__).parent.resolve() +filepath = path / "./tpch/data/lineitem.parquet" # This example serves to demonstrate alternate approaches to answering the # question "return all of the rows that have a specific combination of these diff --git a/examples/tpch/_tests.py b/examples/tpch/_tests.py index 80ff80244..780fcf5e5 100644 --- a/examples/tpch/_tests.py +++ b/examples/tpch/_tests.py @@ -25,8 +25,10 @@ def df_selection(col_name, col_type): - if col_type == pa.float64() or isinstance(col_type, pa.Decimal128Type): + if col_type == pa.float64(): return F.round(col(col_name), lit(2)).alias(col_name) + if isinstance(col_type, pa.Decimal128Type): + return F.round(col(col_name).cast(pa.float64()), lit(2)).alias(col_name) if col_type == pa.string() or col_type == pa.string_view(): return F.trim(col(col_name)).alias(col_name) return col(col_name) diff --git a/examples/tpch/convert_data_to_parquet.py b/examples/tpch/convert_data_to_parquet.py index fd0fcca49..af554c39e 100644 --- a/examples/tpch/convert_data_to_parquet.py +++ 
b/examples/tpch/convert_data_to_parquet.py @@ -22,7 +22,7 @@ as will be generated by the script provided in this repository. """ -import os +from pathlib import Path import datafusion import pyarrow as pa @@ -116,7 +116,7 @@ ("S_COMMENT", pa.string()), ] -curr_dir = os.path.dirname(os.path.abspath(__file__)) +curr_dir = Path(__file__).resolve().parent for filename, curr_schema_val in all_schemas.items(): # For convenience, go ahead and convert the schema column names to lowercase curr_schema = [(s[0].lower(), s[1]) for s in curr_schema_val] @@ -132,10 +132,8 @@ schema = pa.schema(curr_schema) - source_file = os.path.abspath( - os.path.join(curr_dir, f"../../benchmarks/tpch/data/{filename}.csv") - ) - dest_file = os.path.abspath(os.path.join(curr_dir, f"./data/{filename}.parquet")) + source_file = (curr_dir / f"../../benchmarks/tpch/data/{filename}.csv").resolve() + dest_file = (curr_dir / f"./data/{filename}.parquet").resolve() df = ctx.read_csv(source_file, schema=schema, has_header=False, delimiter="|") diff --git a/examples/tpch/q07_volume_shipping.py b/examples/tpch/q07_volume_shipping.py index a84cf728a..ff2f891f1 100644 --- a/examples/tpch/q07_volume_shipping.py +++ b/examples/tpch/q07_volume_shipping.py @@ -80,7 +80,7 @@ # not match these will result in a null value and then get filtered out. 
# # To do the same using a simple filter would be: -# df_nation = df_nation.filter((F.col("n_name") == nation_1) | (F.col("n_name") == nation_2)) +# df_nation = df_nation.filter((F.col("n_name") == nation_1) | (F.col("n_name") == nation_2)) # noqa: ERA001 df_nation = df_nation.with_column( "n_name", F.case(col("n_name")) diff --git a/examples/tpch/q12_ship_mode_order_priority.py b/examples/tpch/q12_ship_mode_order_priority.py index f1d894940..9071597f0 100644 --- a/examples/tpch/q12_ship_mode_order_priority.py +++ b/examples/tpch/q12_ship_mode_order_priority.py @@ -73,7 +73,7 @@ # matches either of the two values, but we want to show doing some array operations in this # example. If you want to see this done with filters, comment out the above line and uncomment # this one. -# df = df.filter((col("l_shipmode") == lit(SHIP_MODE_1)) | (col("l_shipmode") == lit(SHIP_MODE_2))) +# df = df.filter((col("l_shipmode") == lit(SHIP_MODE_1)) | (col("l_shipmode") == lit(SHIP_MODE_2))) # noqa: ERA001 # We need order priority, so join order df to line item diff --git a/examples/tpch/util.py b/examples/tpch/util.py index 7e3d659dd..ec53bcd15 100644 --- a/examples/tpch/util.py +++ b/examples/tpch/util.py @@ -19,18 +19,16 @@ Common utilities for running TPC-H examples. 
""" -import os +from pathlib import Path -def get_data_path(filename: str) -> str: - path = os.path.dirname(os.path.abspath(__file__)) +def get_data_path(filename: str) -> Path: + path = Path(__file__).resolve().parent - return os.path.join(path, "data", filename) + return path / "data" / filename -def get_answer_file(answer_file: str) -> str: - path = os.path.dirname(os.path.abspath(__file__)) +def get_answer_file(answer_file: str) -> Path: + path = Path(__file__).resolve().parent - return os.path.join( - path, "../../benchmarks/tpch/data/answers", f"{answer_file}.out" - ) + return path / "../../benchmarks/tpch/data/answers" / f"{answer_file}.out" diff --git a/pyproject.toml b/pyproject.toml index 69d31ec9f..117aeefc2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,26 +24,30 @@ name = "datafusion" description = "Build and run queries against data" readme = "README.md" license = { file = "LICENSE.txt" } -requires-python = ">=3.9" -keywords = ["datafusion", "dataframe", "rust", "query-engine"] +requires-python = ">=3.10" +keywords = ["dataframe", "datafusion", "query-engine", "rust"] classifiers = [ - "Development Status :: 2 - Pre-Alpha", - "Intended Audience :: Developers", - "License :: OSI Approved :: Apache Software License", - "License :: OSI Approved", - "Operating System :: MacOS", - "Operating System :: Microsoft :: Windows", - "Operating System :: POSIX :: Linux", - "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.9", - "Programming Language :: Python :: 3.10", - "Programming Language :: Python :: 3.11", - "Programming Language :: Python :: 3.12", - "Programming Language :: Python :: 3.13", - "Programming Language :: Python", - "Programming Language :: Rust", + "Development Status :: 2 - Pre-Alpha", + "Intended Audience :: Developers", + "License :: OSI Approved :: Apache Software License", + "License :: OSI Approved", + "Operating System :: MacOS", + "Operating System :: Microsoft :: Windows", + "Operating System :: 
POSIX :: Linux", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", + "Programming Language :: Python :: 3.14", + "Programming Language :: Python", + "Programming Language :: Rust", +] +dependencies = [ + "pyarrow>=16.0.0;python_version<'3.14'", + "pyarrow>=22.0.0;python_version>='3.14'", + "typing-extensions;python_version<'3.13'", ] -dependencies = ["pyarrow>=11.0.0", "typing-extensions;python_version<'3.13'"] dynamic = ["version"] [project.urls] @@ -55,10 +59,11 @@ repository = "https://github.com/apache/datafusion-python" profile = "black" [tool.maturin] +manifest-path = "crates/core/Cargo.toml" python-source = "python" module-name = "datafusion._internal" include = [{ path = "Cargo.lock", format = "sdist" }] -exclude = [".github/**", "ci/**", ".asf.yaml"] +exclude = [".asf.yaml", ".github/**", "ci/**"] # Require Cargo.lock is up to date locked = true features = ["substrait"] @@ -66,39 +71,29 @@ features = ["substrait"] [tool.pytest.ini_options] asyncio_mode = "auto" asyncio_default_fixture_loop_scope = "function" +addopts = "--doctest-modules" +doctest_optionflags = ["NORMALIZE_WHITESPACE", "ELLIPSIS"] +testpaths = ["python/tests", "python/datafusion"] # Enable docstring linting using the google style guide [tool.ruff.lint] -select = ["ALL" ] +select = ["ALL"] ignore = [ - "A001", # Allow using words like min as variable names - "A002", # Allow using words like filter as variable names - "ANN401", # Allow Any for wrapper classes - "COM812", # Recommended to ignore these rules when using with ruff-format - "FIX002", # Allow TODO lines - consider removing at some point - "FBT001", # Allow boolean positional args - "FBT002", # Allow boolean positional args - "ISC001", # Recommended to ignore these rules when using with ruff-format - "SLF001", # Allow accessing private members - "TD002", - "TD003", # 
Allow TODO lines - "UP007", # Disallowing Union is pedantic - # TODO: Enable all of the following, but this PR is getting too large already - "PLR0913", - "TRY003", - "PLR2004", - "PD901", - "ERA001", - "ANN001", - "ANN202", - "PTH", - "N812", - "INP001", - "DTZ007", - "RUF015", - "A005", - "TC001", - "UP035", + "A001", # Allow using words like min as variable names + "A002", # Allow using words like filter as variable names + "A005", # Allow module named io + "ANN401", # Allow Any for wrapper classes + "COM812", # Recommended to ignore these rules when using with ruff-format + "FBT001", # Allow boolean positional args + "FBT002", # Allow boolean positional args + "FIX002", # Allow TODO lines - consider removing at some point + "ISC001", # Recommended to ignore these rules when using with ruff-format + "N812", # Allow importing functions as `F` + "PD901", # Allow variable name df + "PLR0913", # Allow many arguments in function definition + "SLF001", # Allow accessing private members + "TD002", # Do not require author names in TODO statements + "TD003", # Allow TODO lines ] [tool.ruff.lint.pydocstyle] @@ -108,62 +103,104 @@ convention = "google" max-doc-length = 88 [tool.ruff.lint.flake8-boolean-trap] -extend-allowed-calls = ["lit", "datafusion.lit"] +extend-allowed-calls = ["datafusion.lit", "lit"] # Disable docstring checking for these directories [tool.ruff.lint.per-file-ignores] "python/tests/*" = [ - "ANN", - "ARG", - "BLE001", - "D", - "S101", - "SLF", - "PD", - "PLR2004", - "PT011", - "RUF015", - "S608", - "PLR0913", - "PT004", + "ANN", + "ARG", + "BLE001", + "D", + "PD", + "PLC0415", + "PLR0913", + "PLR2004", + "PT004", + "PT011", + "RUF015", + "S101", + "S608", + "SLF", +] +"examples/*" = [ + "ANN001", + "ANN202", + "D", + "DTZ007", + "E501", + "INP001", + "PLR2004", + "RUF015", + "S101", + "T201", + "W505", +] +"dev/*" = [ + "ANN001", + "C", + "D", + "E", + "ERA001", + "EXE", + "N817", + "PLR", + "S", + "SIM", + "T", + "UP", +] +"benchmarks/*" = [ + 
"ANN001", + "BLE", + "D", + "E", + "ERA001", + "EXE", + "F", + "FURB", + "INP001", + "PLR", + "S", + "SIM", + "T", + "TD", + "TRY", + "UP", ] -"examples/*" = ["D", "W505", "E501", "T201", "S101"] -"dev/*" = ["D", "E", "T", "S", "PLR", "C", "SIM", "UP", "EXE", "N817"] -"benchmarks/*" = ["D", "F", "T", "BLE", "FURB", "PLR", "E", "TD", "TRY", "S", "SIM", "EXE", "UP"] "docs/*" = ["D"] +"docs/source/conf.py" = ["ANN001", "ERA001", "INP001"] [tool.codespell] -skip = [ - "./target", - "uv.lock", - "./python/tests/test_functions.py" -] +skip = ["./python/tests/test_functions.py", "./target", "uv.lock"] count = true -ignore-words-list = [ - "ans", - "IST" -] +ignore-words-list = ["IST", "ans"] [dependency-groups] dev = [ - "maturin>=1.8.1", - "numpy>1.25.0", - "pre-commit>=4.0.0", - "pytest>=7.4.4", - "pytest-asyncio>=0.23.3", - "ruff>=0.9.1", - "toml>=0.10.2", - "pygithub==2.5.0", - "codespell==2.4.1", + "arro3-core==0.6.5", + "codespell==2.4.1", + "maturin>=1.8.1", + "nanoarrow==0.8.0", + "numpy>1.25.0;python_version<'3.14'", + "numpy>=2.3.2;python_version>='3.14'", + "pre-commit>=4.3.0", + "pyarrow>=19.0.0", + "pygithub==2.5.0", + "pytest-asyncio>=0.23.3", + "pytest>=7.4.4", + "pyyaml>=6.0.3", + "ruff>=0.9.1", + "toml>=0.10.2", ] docs = [ - "sphinx>=7.1.2", - "pydata-sphinx-theme==0.8.0", - "myst-parser>=3.0.1", - "jinja2>=3.1.5", - "ipython>=8.12.3", - "pandas>=2.0.3", - "pickleshare>=0.7.5", - "sphinx-autoapi>=3.4.0", - "setuptools>=75.3.0", + "ipython>=8.12.3", + "jinja2>=3.1.5", + "myst-parser>=3.0.1", + "pandas>=2.0.3", + "pickleshare>=0.7.5", + "pydata-sphinx-theme==0.8.0", + "setuptools>=75.3.0", + "sphinx-autoapi>=3.4.0", + "sphinx>=7.1.2", ] diff --git a/python/datafusion/__init__.py b/python/datafusion/__init__.py index 77765223e..2e6f81166 100644 --- a/python/datafusion/__init__.py +++ b/python/datafusion/__init__.py @@ -54,6 +54,7 @@ from .dataframe_formatter import configure_formatter from .expr import Expr, WindowFrame from .io import read_avro, read_csv, 
read_json, read_parquet +from .options import CsvReadOptions from .plan import ExecutionPlan, LogicalPlan from .record_batch import RecordBatch, RecordBatchStream from .user_defined import ( @@ -75,6 +76,7 @@ "AggregateUDF", "Catalog", "Config", + "CsvReadOptions", "DFSchema", "DataFrame", "DataFrameWriteOptions", @@ -106,6 +108,7 @@ "lit", "literal", "object_store", + "options", "read_avro", "read_csv", "read_json", @@ -119,12 +122,12 @@ ] -def literal(value) -> Expr: +def literal(value: Any) -> Expr: """Create a literal expression.""" return Expr.literal(value) -def string_literal(value): +def string_literal(value: str) -> Expr: """Create a UTF8 literal expression. It differs from `literal` which creates a UTF8view literal. @@ -132,12 +135,12 @@ def string_literal(value): return Expr.string_literal(value) -def str_lit(value): +def str_lit(value: str) -> Expr: """Alias for `string_literal`.""" return string_literal(value) -def lit(value) -> Expr: +def lit(value: Any) -> Expr: """Create a literal expression.""" return Expr.literal(value) diff --git a/python/datafusion/catalog.py b/python/datafusion/catalog.py index da54d233d..03c0ddc68 100644 --- a/python/datafusion/catalog.py +++ b/python/datafusion/catalog.py @@ -27,8 +27,9 @@ if TYPE_CHECKING: import pyarrow as pa - from datafusion import DataFrame + from datafusion import DataFrame, SessionContext from datafusion.context import TableProviderExportable + from datafusion.expr import CreateExternalTable try: from warnings import deprecated # Python 3.13+ @@ -38,13 +39,61 @@ __all__ = [ "Catalog", + "CatalogList", "CatalogProvider", + "CatalogProviderList", "Schema", "SchemaProvider", "Table", ] +class CatalogList: + """DataFusion data catalog list.""" + + def __init__(self, catalog_list: df_internal.catalog.RawCatalogList) -> None: + """This constructor is not typically called by the end user.""" + self.catalog_list = catalog_list + + def __repr__(self) -> str: + """Print a string representation of the catalog 
list.""" + return self.catalog_list.__repr__() + + def names(self) -> set[str]: + """This is an alias for `catalog_names`.""" + return self.catalog_names() + + def catalog_names(self) -> set[str]: + """Returns the list of schemas in this catalog.""" + return self.catalog_list.catalog_names() + + @staticmethod + def memory_catalog(ctx: SessionContext | None = None) -> CatalogList: + """Create an in-memory catalog provider list.""" + catalog_list = df_internal.catalog.RawCatalogList.memory_catalog(ctx) + return CatalogList(catalog_list) + + def catalog(self, name: str = "datafusion") -> Catalog: + """Returns the catalog with the given ``name`` from this catalog.""" + catalog = self.catalog_list.catalog(name) + + return ( + Catalog(catalog) + if isinstance(catalog, df_internal.catalog.RawCatalog) + else catalog + ) + + def register_catalog( + self, + name: str, + catalog: Catalog | CatalogProvider | CatalogProviderExportable, + ) -> Catalog | None: + """Register a catalog with this catalog list.""" + if isinstance(catalog, Catalog): + return self.catalog_list.register_catalog(name, catalog.catalog) + return self.catalog_list.register_catalog(name, catalog) + + class Catalog: """DataFusion data catalog.""" @@ -65,9 +114,9 @@ def schema_names(self) -> set[str]: return self.catalog.schema_names() @staticmethod - def memory_catalog() -> Catalog: + def memory_catalog(ctx: SessionContext | None = None) -> Catalog: """Create an in-memory catalog provider.""" - catalog = df_internal.catalog.RawCatalog.memory_catalog() + catalog = df_internal.catalog.RawCatalog.memory_catalog(ctx) return Catalog(catalog) def schema(self, name: str = "public") -> Schema: @@ -112,9 +161,9 @@ def __repr__(self) -> str: return self._raw_schema.__repr__() @staticmethod - def memory_schema() -> Schema: + def memory_schema(ctx: SessionContext | None = None) -> Schema: """Create an in-memory schema provider.""" - schema = df_internal.catalog.RawSchema.memory_schema() + schema = 
df_internal.catalog.RawSchema.memory_schema(ctx) return Schema(schema) def names(self) -> set[str]: @@ -141,6 +190,10 @@ def deregister_table(self, name: str) -> None: """Deregister a table provider from this schema.""" return self._raw_schema.deregister_table(name) + def table_exist(self, name: str) -> bool: + """Determines if a table exists in this schema.""" + return self._raw_schema.table_exist(name) + @deprecated("Use `Schema` instead.") class Database(Schema): @@ -163,10 +216,12 @@ class Table: __slots__ = ("_inner",) def __init__( - self, table: Table | TableProviderExportable | DataFrame | pa.dataset.Dataset + self, + table: Table | TableProviderExportable | DataFrame | pa.dataset.Dataset, + ctx: SessionContext | None = None, ) -> None: """Constructor.""" - self._inner = df_internal.catalog.RawTable(table) + self._inner = df_internal.catalog.RawTable(table, ctx) def __repr__(self) -> str: """Print a string representation of the table.""" @@ -189,6 +244,58 @@ def kind(self) -> str: return self._inner.kind +class TableProviderFactory(ABC): + """Abstract class for defining a Python based Table Provider Factory.""" + + @abstractmethod + def create(self, cmd: CreateExternalTable) -> Table: + """Create a table using the :class:`CreateExternalTable`.""" + ... + + +class TableProviderFactoryExportable(Protocol): + """Type hint for object that has __datafusion_table_provider_factory__ PyCapsule. + + https://docs.rs/datafusion/latest/datafusion/catalog/trait.TableProviderFactory.html + """ + + def __datafusion_table_provider_factory__(self, session: Any) -> object: ... + + +class CatalogProviderList(ABC): + """Abstract class for defining a Python based Catalog Provider List.""" + + @abstractmethod + def catalog_names(self) -> set[str]: + """Set of the names of all catalogs in this catalog list.""" + ... 
+ + @abstractmethod + def catalog( + self, name: str + ) -> CatalogProviderExportable | CatalogProvider | Catalog | None: + """Retrieve a specific catalog from this catalog list.""" + ... + + def register_catalog( # noqa: B027 + self, name: str, catalog: CatalogProviderExportable | CatalogProvider | Catalog + ) -> None: + """Add a catalog to this catalog list. + + This method is optional. If your catalog provides a fixed list of catalogs, you + do not need to implement this method. + """ + + +class CatalogProviderListExportable(Protocol): + """Type hint for object that has __datafusion_catalog_provider_list__ PyCapsule. + + https://docs.rs/datafusion/latest/datafusion/catalog/trait.CatalogProviderList.html + """ + + def __datafusion_catalog_provider_list__(self, session: Any) -> object: ... + + class CatalogProvider(ABC): """Abstract class for defining a Python based Catalog Provider.""" @@ -223,6 +330,15 @@ def deregister_schema(self, name: str, cascade: bool) -> None: # noqa: B027 """ +class CatalogProviderExportable(Protocol): + """Type hint for object that has __datafusion_catalog_provider__ PyCapsule. + + https://docs.rs/datafusion/latest/datafusion/catalog/trait.CatalogProvider.html + """ + + def __datafusion_catalog_provider__(self, session: Any) -> object: ... + + class SchemaProvider(ABC): """Abstract class for defining a Python based Schema Provider.""" @@ -271,4 +387,4 @@ class SchemaProviderExportable(Protocol): https://docs.rs/datafusion/latest/datafusion/catalog/trait.SchemaProvider.html """ - def __datafusion_schema_provider__(self) -> object: ... + def __datafusion_schema_provider__(self, session: Any) -> object: ... 
diff --git a/python/datafusion/context.py b/python/datafusion/context.py index 0aa2f27c4..ba9290a58 100644 --- a/python/datafusion/context.py +++ b/python/datafusion/context.py @@ -19,6 +19,7 @@ from __future__ import annotations +import uuid import warnings from typing import TYPE_CHECKING, Any, Protocol @@ -27,11 +28,25 @@ except ImportError: from typing_extensions import deprecated # Python 3.12 + import pyarrow as pa -from datafusion.catalog import Catalog +from datafusion.catalog import ( + Catalog, + CatalogList, + CatalogProviderExportable, + CatalogProviderList, + CatalogProviderListExportable, + TableProviderFactory, + TableProviderFactoryExportable, +) from datafusion.dataframe import DataFrame from datafusion.expr import sort_list_to_raw_sort_list +from datafusion.options import ( + DEFAULT_MAX_INFER_SCHEMA, + CsvReadOptions, + _convert_table_partition_cols, +) from datafusion.record_batch import RecordBatchStream from ._internal import RuntimeEnvBuilder as RuntimeEnvBuilderInternal @@ -86,16 +101,7 @@ class TableProviderExportable(Protocol): https://datafusion.apache.org/python/user-guide/io/table_provider.html """ - def __datafusion_table_provider__(self) -> object: ... # noqa: D105 - - -class CatalogProviderExportable(Protocol): - """Type hint for object that has __datafusion_catalog_provider__ PyCapsule. - - https://docs.rs/datafusion/latest/datafusion/catalog/trait.CatalogProvider.html - """ - - def __datafusion_catalog_provider__(self) -> object: ... # noqa: D105 + def __datafusion_table_provider__(self, session: Any) -> object: ... 
# noqa: D105 class SessionConfig: @@ -582,7 +588,7 @@ def register_listing_table( """ if table_partition_cols is None: table_partition_cols = [] - table_partition_cols = self._convert_table_partition_cols(table_partition_cols) + table_partition_cols = _convert_table_partition_cols(table_partition_cols) self.ctx.register_listing_table( name, str(path), @@ -592,9 +598,19 @@ def register_listing_table( self._convert_file_sort_order(file_sort_order), ) - def sql(self, query: str, options: SQLOptions | None = None) -> DataFrame: + def sql( + self, + query: str, + options: SQLOptions | None = None, + param_values: dict[str, Any] | None = None, + **named_params: Any, + ) -> DataFrame: """Create a :py:class:`~datafusion.DataFrame` from SQL query text. + See the online documentation for a description of how to perform + parameterized substitution via either the ``param_values`` option + or passing in ``named_params``. + Note: This API implements DDL statements such as ``CREATE TABLE`` and ``CREATE VIEW`` and DML statements such as ``INSERT INTO`` with in-memory default implementation.See @@ -603,15 +619,57 @@ def sql(self, query: str, options: SQLOptions | None = None) -> DataFrame: Args: query: SQL query text. options: If provided, the query will be validated against these options. + param_values: Provides substitution of scalar values in the query + after parsing. + named_params: Provides string or DataFrame substitution in the query string. Returns: DataFrame representation of the SQL query. 
""" - if options is None: - return DataFrame(self.ctx.sql(query)) - return DataFrame(self.ctx.sql_with_options(query, options.options_internal)) - def sql_with_options(self, query: str, options: SQLOptions) -> DataFrame: + def value_to_scalar(value: Any) -> pa.Scalar: + if isinstance(value, pa.Scalar): + return value + return pa.scalar(value) + + def value_to_string(value: Any) -> str: + if isinstance(value, DataFrame): + view_name = str(uuid.uuid4()).replace("-", "_") + view_name = f"view_{view_name}" + view = value.df.into_view(temporary=True) + self.ctx.register_table(view_name, view) + return view_name + return str(value) + + param_values = ( + {name: value_to_scalar(value) for (name, value) in param_values.items()} + if param_values is not None + else {} + ) + param_strings = ( + {name: value_to_string(value) for (name, value) in named_params.items()} + if named_params is not None + else {} + ) + + options_raw = options.options_internal if options is not None else None + + return DataFrame( + self.ctx.sql_with_options( + query, + options=options_raw, + param_values=param_values, + param_strings=param_strings, + ) + ) + + def sql_with_options( + self, + query: str, + options: SQLOptions, + param_values: dict[str, Any] | None = None, + **named_params: Any, + ) -> DataFrame: """Create a :py:class:`~datafusion.dataframe.DataFrame` from SQL query text. This function will first validate that the query is allowed by the @@ -620,11 +678,16 @@ def sql_with_options(self, query: str, options: SQLOptions) -> DataFrame: Args: query: SQL query text. options: SQL options. + param_values: Provides substitution of scalar values in the query + after parsing. + named_params: Provides string or DataFrame substitution in the query string. Returns: DataFrame representation of the SQL query. 
""" - return self.sql(query, options) + return self.sql( + query, options=options, param_values=param_values, **named_params + ) def create_dataframe( self, @@ -769,10 +832,36 @@ def deregister_table(self, name: str) -> None: """Remove a table from the session.""" self.ctx.deregister_table(name) + def register_table_factory( + self, + format: str, + factory: TableProviderFactory | TableProviderFactoryExportable, + ) -> None: + """Register a :py:class:`~datafusion.TableProviderFactoryExportable`. + + The registered factory can be referenced from SQL DDL statements executed + against this context. + + Args: + format: The value to be used in `STORED AS ${format}` clause. + factory: A PyCapsule that implements :class:`TableProviderFactoryExportable` + """ + self.ctx.register_table_factory(format, factory) + def catalog_names(self) -> set[str]: """Returns the list of catalogs in this context.""" return self.ctx.catalog_names() + def register_catalog_provider_list( + self, + provider: CatalogProviderListExportable | CatalogProviderList | CatalogList, + ) -> None: + """Register a catalog provider list.""" + if isinstance(provider, CatalogList): + self.ctx.register_catalog_provider_list(provider.catalog) + else: + self.ctx.register_catalog_provider_list(provider) + def register_catalog_provider( self, name: str, provider: CatalogProviderExportable | CatalogProvider | Catalog ) -> None: @@ -846,7 +935,7 @@ def register_parquet( """ if table_partition_cols is None: table_partition_cols = [] - table_partition_cols = self._convert_table_partition_cols(table_partition_cols) + table_partition_cols = _convert_table_partition_cols(table_partition_cols) self.ctx.register_parquet( name, str(path), @@ -865,9 +954,10 @@ def register_csv( schema: pa.Schema | None = None, has_header: bool = True, delimiter: str = ",", - schema_infer_max_records: int = 1000, + schema_infer_max_records: int = DEFAULT_MAX_INFER_SCHEMA, file_extension: str = ".csv", file_compression_type: str | None = None, 
+ options: CsvReadOptions | None = None, ) -> None: """Register a CSV file as a table. @@ -887,18 +977,46 @@ def register_csv( file_extension: File extension; only files with this extension are selected for data input. file_compression_type: File compression type. + options: Set advanced options for CSV reading. This cannot be + combined with any of the other options in this method. """ - path = [str(p) for p in path] if isinstance(path, list) else str(path) + path_arg = [str(p) for p in path] if isinstance(path, list) else str(path) + + if options is not None and ( + schema is not None + or not has_header + or delimiter != "," + or schema_infer_max_records != DEFAULT_MAX_INFER_SCHEMA + or file_extension != ".csv" + or file_compression_type is not None + ): + message = ( + "Combining CsvReadOptions parameter with additional options " + "is not supported. Use CsvReadOptions to set parameters." + ) + warnings.warn( + message, + category=UserWarning, + stacklevel=2, + ) + + options = ( + options + if options is not None + else CsvReadOptions( + schema=schema, + has_header=has_header, + delimiter=delimiter, + schema_infer_max_records=schema_infer_max_records, + file_extension=file_extension, + file_compression_type=file_compression_type, + ) + ) self.ctx.register_csv( name, - path, - schema, - has_header, - delimiter, - schema_infer_max_records, - file_extension, - file_compression_type, + path_arg, + options.to_inner(), ) def register_json( @@ -929,7 +1047,7 @@ def register_json( """ if table_partition_cols is None: table_partition_cols = [] - table_partition_cols = self._convert_table_partition_cols(table_partition_cols) + table_partition_cols = _convert_table_partition_cols(table_partition_cols) self.ctx.register_json( name, str(path), @@ -962,7 +1080,7 @@ def register_avro( """ if table_partition_cols is None: table_partition_cols = [] - table_partition_cols = self._convert_table_partition_cols(table_partition_cols) + table_partition_cols = 
_convert_table_partition_cols(table_partition_cols) self.ctx.register_avro( name, str(path), schema, file_extension, table_partition_cols ) @@ -1042,7 +1160,7 @@ def read_json( """ if table_partition_cols is None: table_partition_cols = [] - table_partition_cols = self._convert_table_partition_cols(table_partition_cols) + table_partition_cols = _convert_table_partition_cols(table_partition_cols) return DataFrame( self.ctx.read_json( str(path), @@ -1060,10 +1178,11 @@ def read_csv( schema: pa.Schema | None = None, has_header: bool = True, delimiter: str = ",", - schema_infer_max_records: int = 1000, + schema_infer_max_records: int = DEFAULT_MAX_INFER_SCHEMA, file_extension: str = ".csv", table_partition_cols: list[tuple[str, str | pa.DataType]] | None = None, file_compression_type: str | None = None, + options: CsvReadOptions | None = None, ) -> DataFrame: """Read a CSV data source. @@ -1081,26 +1200,51 @@ def read_csv( selected for data input. table_partition_cols: Partition columns. file_compression_type: File compression type. + options: Set advanced options for CSV reading. This cannot be + combined with any of the other options in this method. Returns: DataFrame representation of the read CSV files """ - if table_partition_cols is None: - table_partition_cols = [] - table_partition_cols = self._convert_table_partition_cols(table_partition_cols) + path_arg = [str(p) for p in path] if isinstance(path, list) else str(path) + + if options is not None and ( + schema is not None + or not has_header + or delimiter != "," + or schema_infer_max_records != DEFAULT_MAX_INFER_SCHEMA + or file_extension != ".csv" + or table_partition_cols is not None + or file_compression_type is not None + ): + message = ( + "Combining CsvReadOptions parameter with additional options " + "is not supported. Use CsvReadOptions to set parameters." 
+ ) + warnings.warn( + message, + category=UserWarning, + stacklevel=2, + ) - path = [str(p) for p in path] if isinstance(path, list) else str(path) + options = ( + options + if options is not None + else CsvReadOptions( + schema=schema, + has_header=has_header, + delimiter=delimiter, + schema_infer_max_records=schema_infer_max_records, + file_extension=file_extension, + table_partition_cols=table_partition_cols, + file_compression_type=file_compression_type, + ) + ) return DataFrame( self.ctx.read_csv( - path, - schema, - has_header, - delimiter, - schema_infer_max_records, - file_extension, - table_partition_cols, - file_compression_type, + path_arg, + options.to_inner(), ) ) @@ -1138,7 +1282,7 @@ def read_parquet( """ if table_partition_cols is None: table_partition_cols = [] - table_partition_cols = self._convert_table_partition_cols(table_partition_cols) + table_partition_cols = _convert_table_partition_cols(table_partition_cols) file_sort_order = self._convert_file_sort_order(file_sort_order) return DataFrame( self.ctx.read_parquet( @@ -1172,7 +1316,7 @@ def read_avro( """ if file_partition_cols is None: file_partition_cols = [] - file_partition_cols = self._convert_table_partition_cols(file_partition_cols) + file_partition_cols = _convert_table_partition_cols(file_partition_cols) return DataFrame( self.ctx.read_avro(str(path), schema, file_partition_cols, file_extension) ) @@ -1242,3 +1386,19 @@ def _convert_table_partition_cols( ) return converted_table_partition_cols + + def __datafusion_task_context_provider__(self) -> Any: + """Access the PyCapsule FFI_TaskContextProvider.""" + return self.ctx.__datafusion_task_context_provider__() + + def __datafusion_logical_extension_codec__(self) -> Any: + """Access the PyCapsule FFI_LogicalExtensionCodec.""" + return self.ctx.__datafusion_logical_extension_codec__() + + def with_logical_extension_codec(self, codec: Any) -> SessionContext: + """Create a new session context with specified codec. 
+ + This only supports codecs that have been implemented using the + FFI interface. + """ + return self.ctx.with_logical_extension_codec(codec) diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py index d15111d57..214d44a42 100644 --- a/python/datafusion/dataframe.py +++ b/python/datafusion/dataframe.py @@ -22,14 +22,11 @@ from __future__ import annotations import warnings -from collections.abc import Sequence +from collections.abc import AsyncIterator, Iterable, Iterator, Sequence from typing import ( TYPE_CHECKING, Any, - Iterable, Literal, - Optional, - Union, overload, ) @@ -53,11 +50,11 @@ sort_list_to_raw_sort_list, ) from datafusion.plan import ExecutionPlan, LogicalPlan -from datafusion.record_batch import RecordBatchStream +from datafusion.record_batch import RecordBatch, RecordBatchStream if TYPE_CHECKING: import pathlib - from typing import Callable + from collections.abc import Callable import pandas as pd import polars as pl @@ -80,7 +77,7 @@ class Compression(Enum): LZ4 = "lz4" # lzo is not implemented yet # https://github.com/apache/arrow-rs/issues/6970 - # LZO = "lzo" + # LZO = "lzo" # noqa: ERA001 ZSTD = "zstd" LZ4_RAW = "lz4_raw" @@ -107,7 +104,7 @@ def from_str(cls: type[Compression], value: str) -> Compression: """ raise ValueError(error_msg) from err - def get_default_level(self) -> Optional[int]: + def get_default_level(self) -> int | None: """Get the default compression level for the compression type. 
Returns: @@ -140,24 +137,24 @@ def __init__( write_batch_size: int = 1024, writer_version: str = "1.0", skip_arrow_metadata: bool = False, - compression: Optional[str] = "zstd(3)", - compression_level: Optional[int] = None, - dictionary_enabled: Optional[bool] = True, + compression: str | None = "zstd(3)", + compression_level: int | None = None, + dictionary_enabled: bool | None = True, dictionary_page_size_limit: int = 1024 * 1024, - statistics_enabled: Optional[str] = "page", + statistics_enabled: str | None = "page", max_row_group_size: int = 1024 * 1024, created_by: str = "datafusion-python", - column_index_truncate_length: Optional[int] = 64, - statistics_truncate_length: Optional[int] = None, + column_index_truncate_length: int | None = 64, + statistics_truncate_length: int | None = None, data_page_row_count_limit: int = 20_000, - encoding: Optional[str] = None, + encoding: str | None = None, bloom_filter_on_write: bool = False, - bloom_filter_fpp: Optional[float] = None, - bloom_filter_ndv: Optional[int] = None, + bloom_filter_fpp: float | None = None, + bloom_filter_ndv: int | None = None, allow_single_file_parallelism: bool = True, maximum_parallel_row_group_writers: int = 1, maximum_buffered_record_batches_per_stream: int = 2, - column_specific_options: Optional[dict[str, ParquetColumnOptions]] = None, + column_specific_options: dict[str, ParquetColumnOptions] | None = None, ) -> None: """Initialize the ParquetWriterOptions. 
@@ -262,13 +259,13 @@ class ParquetColumnOptions: def __init__( self, - encoding: Optional[str] = None, - dictionary_enabled: Optional[bool] = None, - compression: Optional[str] = None, - statistics_enabled: Optional[str] = None, - bloom_filter_enabled: Optional[bool] = None, - bloom_filter_fpp: Optional[float] = None, - bloom_filter_ndv: Optional[int] = None, + encoding: str | None = None, + dictionary_enabled: bool | None = None, + compression: str | None = None, + statistics_enabled: str | None = None, + bloom_filter_enabled: bool | None = None, + bloom_filter_fpp: float | None = None, + bloom_filter_ndv: int | None = None, ) -> None: """Initialize the ParquetColumnOptions. @@ -307,6 +304,9 @@ def __init__( class DataFrame: """Two dimensional table representation of data. + DataFrame objects are iterable; iterating over a DataFrame yields + :class:`datafusion.RecordBatch` instances lazily. + See :ref:`user_guide_concepts` in the online documentation for more information. """ @@ -318,7 +318,7 @@ def __init__(self, df: DataFrameInternal) -> None: """ self.df = df - def into_view(self) -> Table: + def into_view(self, temporary: bool = False) -> Table: """Convert ``DataFrame`` into a :class:`~datafusion.Table`. Examples: @@ -327,15 +327,16 @@ def into_view(self) -> Table: >>> df = ctx.sql("SELECT 1 AS value") >>> view = df.into_view() >>> ctx.register_table("values_view", view) - >>> df.collect() # The DataFrame is still usable - >>> ctx.sql("SELECT value FROM values_view").collect() + >>> result = ctx.sql("SELECT value FROM values_view").collect() + >>> result[0].column("value").to_pylist() + [1] """ from datafusion.catalog import Table as _Table - return _Table(self.df.into_view()) + return _Table(self.df.into_view(temporary)) def __getitem__(self, key: str | list[str]) -> DataFrame: - """Return a new :py:class`DataFrame` with the specified column or columns. + """Return a new :py:class:`DataFrame` with the specified column or columns. 
Args: key: Column name or list of column names to select. @@ -441,56 +442,52 @@ def select(self, *exprs: Expr | str) -> DataFrame: def drop(self, *columns: str) -> DataFrame: """Drop arbitrary amount of columns. - Column names are case-sensitive and do not require double quotes like - other operations such as `select`. Leading and trailing double quotes - are allowed and will be automatically stripped if present. + Column names are case-sensitive and require double quotes to be dropped + if the original name is not strictly lower case. Args: - columns: Column names to drop from the dataframe. Both ``column_name`` - and ``"column_name"`` are accepted. + columns: Column names to drop from the dataframe. Returns: DataFrame with those columns removed in the projection. Example Usage:: - - df.drop('ID_For_Students') # Works - df.drop('"ID_For_Students"') # Also works (quotes stripped) + df.drop('a') # To drop a lower-cased column 'a' + df.drop('"a"') # To drop an upper-cased column 'A' """ - normalized_columns = [] - for col in columns: - if col.startswith('"') and col.endswith('"'): - normalized_columns.append(col.strip('"')) # Strip double quotes - else: - normalized_columns.append(col) - - return DataFrame(self.df.drop(*normalized_columns)) + return DataFrame(self.df.drop(*columns)) - def filter(self, *predicates: Expr) -> DataFrame: + def filter(self, *predicates: Expr | str) -> DataFrame: """Return a DataFrame for which ``predicate`` evaluates to ``True``. Rows for which ``predicate`` evaluates to ``False`` or ``None`` are filtered out. If more than one predicate is provided, these predicates will be - combined as a logical AND. Each ``predicate`` must be an + combined as a logical AND. Each ``predicate`` can be an :class:`~datafusion.expr.Expr` created using helper functions such as - :func:`datafusion.col` or :func:`datafusion.lit`. - If more complex logic is required, see the logical operations in - :py:mod:`~datafusion.functions`. 
+ :func:`datafusion.col` or :func:`datafusion.lit`, or a SQL expression string + that will be parsed against the DataFrame schema. If more complex logic is + required, see the logical operations in :py:mod:`~datafusion.functions`. Example:: from datafusion import col, lit df.filter(col("a") > lit(1)) + df.filter("a > 1") Args: - predicates: Predicate expression(s) to filter the DataFrame. + predicates: Predicate expression(s) or SQL strings to filter the DataFrame. Returns: DataFrame after filtering. """ df = self.df - for p in predicates: - df = df.filter(ensure_expr(p)) + for predicate in predicates: + expr = ( + self.parse_sql_expr(predicate) + if isinstance(predicate, str) + else predicate + ) + df = df.filter(ensure_expr(expr)) return DataFrame(df) def parse_sql_expr(self, expr: str) -> Expr: @@ -515,11 +512,12 @@ def parse_sql_expr(self, expr: str) -> Expr: """ return Expr(self.df.parse_sql_expr(expr)) - def with_column(self, name: str, expr: Expr) -> DataFrame: + def with_column(self, name: str, expr: Expr | str) -> DataFrame: """Add an additional column to the DataFrame. The ``expr`` must be an :class:`~datafusion.expr.Expr` constructed with - :func:`datafusion.col` or :func:`datafusion.lit`. + :func:`datafusion.col` or :func:`datafusion.lit`, or a SQL expression + string that will be parsed against the DataFrame schema. Example:: @@ -533,16 +531,19 @@ def with_column(self, name: str, expr: Expr) -> DataFrame: Returns: DataFrame with the new column. """ + expr = self.parse_sql_expr(expr) if isinstance(expr, str) else expr + return DataFrame(self.df.with_column(name, ensure_expr(expr))) def with_columns( - self, *exprs: Expr | Iterable[Expr], **named_exprs: Expr + self, *exprs: Expr | str | Iterable[Expr | str], **named_exprs: Expr | str ) -> DataFrame: """Add columns to the DataFrame. - By passing expressions, iterables of expressions, or named expressions. + By passing expressions, iterables of expressions, string SQL expressions, + or named expressions. 
All expressions must be :class:`~datafusion.expr.Expr` objects created via - :func:`datafusion.col` or :func:`datafusion.lit`. + :func:`datafusion.col` or :func:`datafusion.lit`, or SQL expression strings. To pass named expressions use the form ``name=Expr``. Example usage: The following will add 4 columns labeled ``a``, ``b``, ``c``, @@ -555,17 +556,44 @@ def with_columns( d=lit(3) ) + Equivalent example using just SQL strings: + + df = df.with_columns( + "x as a", + ["1 as b", "y as c"], + d="3" + ) + Args: - exprs: Either a single expression or an iterable of expressions to add. + exprs: Either a single expression, an iterable of expressions to add or + SQL expression strings. named_exprs: Named expressions in the form of ``name=expr`` Returns: DataFrame with the new columns added. """ - expressions = ensure_expr_list(exprs) + expressions = [] + for expr in exprs: + if isinstance(expr, str): + expressions.append(self.parse_sql_expr(expr).expr) + elif isinstance(expr, Iterable) and not isinstance( + expr, Expr | str | bytes | bytearray + ): + expressions.extend( + [ + self.parse_sql_expr(e).expr + if isinstance(e, str) + else ensure_expr(e) + for e in expr + ] + ) + else: + expressions.append(ensure_expr(expr)) + for alias, expr in named_exprs.items(): - ensure_expr(expr) - expressions.append(expr.alias(alias).expr) + e = self.parse_sql_expr(expr) if isinstance(expr, str) else expr + ensure_expr(e) + expressions.append(e.alias(alias).expr) return DataFrame(self.df.with_columns(expressions)) @@ -602,7 +630,7 @@ def aggregate( """ group_by_list = ( list(group_by) - if isinstance(group_by, Sequence) and not isinstance(group_by, (Expr, str)) + if isinstance(group_by, Sequence) and not isinstance(group_by, Expr | str) else [group_by] ) aggs_list = ( @@ -691,6 +719,10 @@ def collect(self) -> list[pa.RecordBatch]: """ return self.df.collect() + def collect_column(self, column_name: str) -> pa.Array | pa.ChunkedArray: + """Executes this :py:class:`DataFrame` for a single 
column.""" + return self.df.collect_column(column_name) + def cache(self) -> DataFrame: """Cache the DataFrame as a memory table. @@ -737,6 +769,7 @@ def join( left_on: None = None, right_on: None = None, join_keys: None = None, + coalesce_duplicate_keys: bool = True, ) -> DataFrame: ... @overload @@ -749,6 +782,7 @@ def join( left_on: str | Sequence[str], right_on: str | Sequence[str], join_keys: tuple[list[str], list[str]] | None = None, + coalesce_duplicate_keys: bool = True, ) -> DataFrame: ... @overload @@ -761,6 +795,7 @@ def join( join_keys: tuple[list[str], list[str]], left_on: None = None, right_on: None = None, + coalesce_duplicate_keys: bool = True, ) -> DataFrame: ... def join( @@ -772,6 +807,7 @@ def join( left_on: str | Sequence[str] | None = None, right_on: str | Sequence[str] | None = None, join_keys: tuple[list[str], list[str]] | None = None, + coalesce_duplicate_keys: bool = True, ) -> DataFrame: """Join this :py:class:`DataFrame` with another :py:class:`DataFrame`. @@ -784,33 +820,37 @@ def join( "right", "full", "semi", "anti". left_on: Join column of the left dataframe. right_on: Join column of the right dataframe. + coalesce_duplicate_keys: When True, coalesce the columns + from the right DataFrame and left DataFrame + that have identical names in the ``on`` fields. join_keys: Tuple of two lists of column names to join on. [Deprecated] Returns: DataFrame after join. """ + if join_keys is not None: + warnings.warn( + "`join_keys` is deprecated, use `on` or `left_on` with `right_on`", + category=DeprecationWarning, + stacklevel=2, + ) + left_on = join_keys[0] + right_on = join_keys[1] + # This check is to prevent breaking API changes where users prior to # DF 43.0.0 would pass the join_keys as a positional argument instead # of a keyword argument. 
if ( isinstance(on, tuple) - and len(on) == 2 + and len(on) == 2 # noqa: PLR2004 and isinstance(on[0], list) and isinstance(on[1], list) ): # We know this is safe because we've checked the types - join_keys = on # type: ignore[assignment] + left_on = on[0] + right_on = on[1] on = None - if join_keys is not None: - warnings.warn( - "`join_keys` is deprecated, use `on` or `left_on` with `right_on`", - category=DeprecationWarning, - stacklevel=2, - ) - left_on = join_keys[0] - right_on = join_keys[1] - if on is not None: if left_on is not None or right_on is not None: error_msg = "`left_on` or `right_on` should not provided with `on`" @@ -829,7 +869,9 @@ def join( if isinstance(right_on, str): right_on = [right_on] - return DataFrame(self.df.join(right.df, how, left_on, right_on)) + return DataFrame( + self.df.join(right.df, how, left_on, right_on, coalesce_duplicate_keys) + ) def join_on( self, @@ -908,17 +950,20 @@ def repartition(self, num: int) -> DataFrame: """ return DataFrame(self.df.repartition(num)) - def repartition_by_hash(self, *exprs: Expr, num: int) -> DataFrame: + def repartition_by_hash(self, *exprs: Expr | str, num: int) -> DataFrame: """Repartition a DataFrame using a hash partitioning scheme. Args: - exprs: Expressions to evaluate and perform hashing on. + exprs: Expressions or a SQL expression string to evaluate + and perform hashing on. num: Number of partitions to repartition the DataFrame into. Returns: Repartitioned DataFrame. 
""" - exprs = [expr.expr for expr in exprs] + exprs = [self.parse_sql_expr(e) if isinstance(e, str) else e for e in exprs] + exprs = expr_list_to_raw_expr_list(exprs) + return DataFrame(self.df.repartition_by_hash(*exprs, num=num)) def union(self, other: DataFrame, distinct: bool = False) -> DataFrame: @@ -1023,7 +1068,7 @@ def write_parquet( def write_parquet( self, path: str | pathlib.Path, - compression: Union[str, Compression, ParquetWriterOptions] = Compression.ZSTD, + compression: str | Compression | ParquetWriterOptions = Compression.ZSTD, compression_level: int | None = None, write_options: DataFrameWriteOptions | None = None, ) -> None: @@ -1254,21 +1299,54 @@ def unnest_columns(self, *columns: str, preserve_nulls: bool = True) -> DataFram return DataFrame(self.df.unnest_columns(columns, preserve_nulls=preserve_nulls)) def __arrow_c_stream__(self, requested_schema: object | None = None) -> object: - """Export an Arrow PyCapsule Stream. + """Export the DataFrame as an Arrow C Stream. + + The DataFrame is executed using DataFusion's streaming APIs and exposed via + Arrow's C Stream interface. Record batches are produced incrementally, so the + full result set is never materialized in memory. - This will execute and collect the DataFrame. We will attempt to respect the - requested schema, but only trivial transformations will be applied such as only - returning the fields listed in the requested schema if their data types match - those in the DataFrame. + When ``requested_schema`` is provided, DataFusion applies only simple + projections such as selecting a subset of existing columns or reordering + them. Column renaming, computed expressions, or type coercion are not + supported through this interface. Args: - requested_schema: Attempt to provide the DataFrame using this schema. + requested_schema: Either a :py:class:`pyarrow.Schema` or an Arrow C + Schema capsule (``PyCapsule``) produced by + ``schema._export_to_c_capsule()``. 
The DataFrame will attempt to + align its output with the fields and order specified by this schema. Returns: - Arrow PyCapsule object. + Arrow ``PyCapsule`` object representing an ``ArrowArrayStream``. + + For practical usage patterns, see the Apache Arrow streaming + documentation: https://arrow.apache.org/docs/python/ipc.html#streaming. + + For details on DataFusion's Arrow integration and DataFrame streaming, + see the user guide (user-guide/io/arrow and user-guide/dataframe/index). + + Notes: + The Arrow C Data Interface PyCapsule details are documented by Apache + Arrow and can be found at: + https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html """ + # ``DataFrame.__arrow_c_stream__`` in the Rust extension leverages + # ``execute_stream_partitioned`` under the hood to stream batches while + # preserving the original partition order. return self.df.__arrow_c_stream__(requested_schema) + def __iter__(self) -> Iterator[RecordBatch]: + """Return an iterator over this DataFrame's record batches.""" + return iter(self.execute_stream()) + + def __aiter__(self) -> AsyncIterator[RecordBatch]: + """Return an async iterator over this DataFrame's record batches. + + We're using __aiter__ because we support Python < 3.10 where aiter() is not + available. + """ + return self.execute_stream().__aiter__() + def transform(self, func: Callable[..., DataFrame], *args: Any) -> DataFrame: """Apply a function to the current DataFrame which returns another DataFrame. 
@@ -1302,9 +1380,12 @@ def fill_null(self, value: Any, subset: list[str] | None = None) -> DataFrame: DataFrame with null values replaced where type casting is possible Examples: - >>> df = df.fill_null(0) # Fill all nulls with 0 where possible - >>> # Fill nulls in specific string columns - >>> df = df.fill_null("missing", subset=["name", "category"]) + >>> from datafusion import SessionContext, col + >>> ctx = SessionContext() + >>> df = ctx.from_pydict({"a": [1, None, 3], "b": [None, 5, 6]}) + >>> filled = df.fill_null(0) + >>> filled.sort(col("a")).collect()[0].column("a").to_pylist() + [0, 1, 3] Notes: - Only fills nulls in columns where the value can be cast to the column type diff --git a/python/datafusion/dataframe_formatter.py b/python/datafusion/dataframe_formatter.py index 2323224b8..b8af45a1b 100644 --- a/python/datafusion/dataframe_formatter.py +++ b/python/datafusion/dataframe_formatter.py @@ -18,16 +18,19 @@ from __future__ import annotations +import warnings from typing import ( + TYPE_CHECKING, Any, - Callable, - Optional, Protocol, runtime_checkable, ) from datafusion._internal import DataFrame as DataFrameInternal +if TYPE_CHECKING: + from collections.abc import Callable + def _validate_positive_int(value: Any, param_name: str) -> None: """Validate that a parameter is a positive integer. @@ -59,6 +62,93 @@ def _validate_bool(value: Any, param_name: str) -> None: raise TypeError(msg) +def _validate_formatter_parameters( + max_cell_length: int, + max_width: int, + max_height: int, + max_memory_bytes: int, + min_rows: int, + max_rows: int | None, + repr_rows: int | None, + enable_cell_expansion: bool, + show_truncation_message: bool, + use_shared_styles: bool, + custom_css: str | None, + style_provider: Any, +) -> int: + """Validate all formatter parameters and return resolved max_rows value. 
+ + Args: + max_cell_length: Maximum cell length value to validate + max_width: Maximum width value to validate + max_height: Maximum height value to validate + max_memory_bytes: Maximum memory bytes value to validate + min_rows: Minimum rows to display value to validate + max_rows: Maximum rows value to validate (None means use default) + repr_rows: Deprecated repr_rows value to validate + enable_cell_expansion: Boolean expansion flag to validate + show_truncation_message: Boolean message flag to validate + use_shared_styles: Boolean styles flag to validate + custom_css: Custom CSS string to validate + style_provider: Style provider object to validate + + Returns: + The resolved max_rows value after handling repr_rows deprecation + + Raises: + ValueError: If any numeric parameter is invalid or constraints are violated + TypeError: If any parameter has invalid type + DeprecationWarning: If repr_rows parameter is used + """ + # Validate numeric parameters + _validate_positive_int(max_cell_length, "max_cell_length") + _validate_positive_int(max_width, "max_width") + _validate_positive_int(max_height, "max_height") + _validate_positive_int(max_memory_bytes, "max_memory_bytes") + _validate_positive_int(min_rows, "min_rows") + + # Handle deprecated repr_rows parameter + if repr_rows is not None: + warnings.warn( + "repr_rows parameter is deprecated, use max_rows instead", + DeprecationWarning, + stacklevel=4, + ) + _validate_positive_int(repr_rows, "repr_rows") + if max_rows is not None and repr_rows != max_rows: + msg = "Cannot specify both repr_rows and max_rows; use max_rows only" + raise ValueError(msg) + max_rows = repr_rows + + # Use default if max_rows was not provided + if max_rows is None: + max_rows = 10 + + _validate_positive_int(max_rows, "max_rows") + + # Validate constraint: min_rows <= max_rows + if min_rows > max_rows: + msg = "min_rows must be less than or equal to max_rows" + raise ValueError(msg) + + # Validate boolean parameters + 
_validate_bool(enable_cell_expansion, "enable_cell_expansion") + _validate_bool(show_truncation_message, "show_truncation_message") + _validate_bool(use_shared_styles, "use_shared_styles") + + # Validate custom_css + if custom_css is not None and not isinstance(custom_css, str): + msg = "custom_css must be None or a string" + raise TypeError(msg) + + # Validate style_provider + if style_provider is not None and not isinstance(style_provider, StyleProvider): + msg = "style_provider must implement the StyleProvider protocol" + raise TypeError(msg) + + return max_rows + + @runtime_checkable class CellFormatter(Protocol): """Protocol for cell value formatters.""" @@ -124,8 +214,9 @@ class DataFrameHtmlFormatter: max_width: Maximum width of the HTML table in pixels max_height: Maximum height of the HTML table in pixels max_memory_bytes: Maximum memory in bytes for rendered data (default: 2MB) - min_rows_display: Minimum number of rows to display - repr_rows: Default number of rows to display in repr output + min_rows: Minimum number of rows to display (must be <= max_rows) + max_rows: Maximum number of rows to display in repr output + repr_rows: Deprecated alias for max_rows enable_cell_expansion: Whether to add expand/collapse buttons for long cell values custom_css: Additional CSS to include in the HTML output @@ -141,83 +232,83 @@ def __init__( max_width: int = 1000, max_height: int = 300, max_memory_bytes: int = 2 * 1024 * 1024, # 2 MB - min_rows_display: int = 20, - repr_rows: int = 10, + min_rows: int = 10, + max_rows: int | None = None, + repr_rows: int | None = None, enable_cell_expansion: bool = True, - custom_css: Optional[str] = None, + custom_css: str | None = None, show_truncation_message: bool = True, - style_provider: Optional[StyleProvider] = None, + style_provider: StyleProvider | None = None, use_shared_styles: bool = True, ) -> None: """Initialize the HTML formatter. 
Parameters ---------- - max_cell_length : int, default 25 + max_cell_length Maximum length of cell content before truncation. - max_width : int, default 1000 + max_width Maximum width of the displayed table in pixels. - max_height : int, default 300 + max_height Maximum height of the displayed table in pixels. - max_memory_bytes : int, default 2097152 (2MB) - Maximum memory in bytes for rendered data. - min_rows_display : int, default 20 - Minimum number of rows to display. - repr_rows : int, default 10 - Default number of rows to display in repr output. - enable_cell_expansion : bool, default True + max_memory_bytes + Maximum memory in bytes for rendered data. Helps prevent performance + issues with large datasets. + min_rows + Minimum number of rows to display even if memory limit is reached. + Must not exceed ``max_rows``. + max_rows + Maximum number of rows to display. Takes precedence over memory limits + when fewer rows are requested. + repr_rows + Deprecated alias for ``max_rows``. Use ``max_rows`` instead. + enable_cell_expansion Whether to allow cells to expand when clicked. - custom_css : str, optional + custom_css Custom CSS to apply to the HTML table. - show_truncation_message : bool, default True + show_truncation_message Whether to show a message indicating that content has been truncated. - style_provider : StyleProvider, optional + style_provider Provider of CSS styles for the HTML table. If None, DefaultStyleProvider is used. - use_shared_styles : bool, default True - Whether to use shared styles across multiple tables. + use_shared_styles + Whether to use shared styles across multiple tables. This improves + performance when displaying many DataFrames in a single notebook. Raises: ------ ValueError If max_cell_length, max_width, max_height, max_memory_bytes, - min_rows_display, or repr_rows is not a positive integer. + min_rows or max_rows is not a positive integer, or if min_rows + exceeds max_rows. 
TypeError If enable_cell_expansion, show_truncation_message, or use_shared_styles is - not a boolean, - or if custom_css is provided but is not a string, - or if style_provider is provided but does not implement the StyleProvider + not a boolean, or if custom_css is provided but is not a string, or if + style_provider is provided but does not implement the StyleProvider protocol. """ - # Validate numeric parameters - _validate_positive_int(max_cell_length, "max_cell_length") - _validate_positive_int(max_width, "max_width") - _validate_positive_int(max_height, "max_height") - _validate_positive_int(max_memory_bytes, "max_memory_bytes") - _validate_positive_int(min_rows_display, "min_rows_display") - _validate_positive_int(repr_rows, "repr_rows") - - # Validate boolean parameters - _validate_bool(enable_cell_expansion, "enable_cell_expansion") - _validate_bool(show_truncation_message, "show_truncation_message") - _validate_bool(use_shared_styles, "use_shared_styles") - - # Validate custom_css - if custom_css is not None and not isinstance(custom_css, str): - msg = "custom_css must be None or a string" - raise TypeError(msg) - - # Validate style_provider - if style_provider is not None and not isinstance(style_provider, StyleProvider): - msg = "style_provider must implement the StyleProvider protocol" - raise TypeError(msg) + # Validate all parameters and get resolved max_rows + resolved_max_rows = _validate_formatter_parameters( + max_cell_length, + max_width, + max_height, + max_memory_bytes, + min_rows, + max_rows, + repr_rows, + enable_cell_expansion, + show_truncation_message, + use_shared_styles, + custom_css, + style_provider, + ) self.max_cell_length = max_cell_length self.max_width = max_width self.max_height = max_height self.max_memory_bytes = max_memory_bytes - self.min_rows_display = min_rows_display - self.repr_rows = repr_rows + self.min_rows = min_rows + self._max_rows = resolved_max_rows self.enable_cell_expansion = enable_cell_expansion 
self.custom_css = custom_css self.show_truncation_message = show_truncation_message @@ -226,8 +317,57 @@ def __init__( # Registry for custom type formatters self._type_formatters: dict[type, CellFormatter] = {} # Custom cell builders - self._custom_cell_builder: Optional[Callable[[Any, int, int, str], str]] = None - self._custom_header_builder: Optional[Callable[[Any], str]] = None + self._custom_cell_builder: Callable[[Any, int, int, str], str] | None = None + self._custom_header_builder: Callable[[Any], str] | None = None + + @property + def max_rows(self) -> int: + """Get the maximum number of rows to display. + + Returns: + The maximum number of rows to display in repr output + """ + return self._max_rows + + @max_rows.setter + def max_rows(self, value: int) -> None: + """Set the maximum number of rows to display. + + Args: + value: The maximum number of rows + """ + self._max_rows = value + + @property + def repr_rows(self) -> int: + """Get the maximum number of rows (deprecated name). + + .. deprecated:: + Use :attr:`max_rows` instead. This property is provided for + backward compatibility. + + Returns: + The maximum number of rows to display + """ + return self._max_rows + + @repr_rows.setter + def repr_rows(self, value: int) -> None: + """Set the maximum number of rows using deprecated name. + + .. deprecated:: + Use :attr:`max_rows` setter instead. This property is provided for + backward compatibility. + + Args: + value: The maximum number of rows + """ + warnings.warn( + "repr_rows is deprecated, use max_rows instead", + DeprecationWarning, + stacklevel=2, + ) + self._max_rows = value def register_formatter(self, type_class: type, formatter: CellFormatter) -> None: """Register a custom formatter for a specific data type. @@ -368,7 +508,7 @@ def _build_table_container_start(self) -> list[str]: f"max-height: {self.max_height}px; overflow: auto; border: " '1px solid #ccc;">' ) - html.append('') + html.append('
') return html def _build_table_header(self, schema: Any) -> list[str]: @@ -657,7 +797,8 @@ def configure_formatter(**kwargs: Any) -> None: "max_width", "max_height", "max_memory_bytes", - "min_rows_display", + "min_rows", + "max_rows", "repr_rows", "enable_cell_expansion", "custom_css", diff --git a/python/datafusion/expr.py b/python/datafusion/expr.py index 82e30a78c..5760b8948 100644 --- a/python/datafusion/expr.py +++ b/python/datafusion/expr.py @@ -20,10 +20,12 @@ See :ref:`Expressions` in the online documentation for more details. """ +# ruff: noqa: PLC0415 + from __future__ import annotations -import typing as _typing -from typing import TYPE_CHECKING, Any, ClassVar, Iterable, Optional, Sequence +from collections.abc import Iterable, Sequence +from typing import TYPE_CHECKING, Any, ClassVar try: from warnings import deprecated # Python 3.13+ @@ -230,7 +232,7 @@ ] -def ensure_expr(value: _typing.Union[Expr, Any]) -> expr_internal.Expr: +def ensure_expr(value: Expr | Any) -> expr_internal.Expr: """Return the internal expression from ``Expr`` or raise ``TypeError``. This helper rejects plain strings and other non-:class:`Expr` values so @@ -252,7 +254,7 @@ def ensure_expr(value: _typing.Union[Expr, Any]) -> expr_internal.Expr: def ensure_expr_list( - exprs: Iterable[_typing.Union[Expr, Iterable[Expr]]], + exprs: Iterable[Expr | Iterable[Expr]], ) -> list[expr_internal.Expr]: """Flatten an iterable of expressions, validating each via ``ensure_expr``. 
@@ -267,11 +269,11 @@ def ensure_expr_list( """ def _iter( - items: Iterable[_typing.Union[Expr, Iterable[Expr]]], + items: Iterable[Expr | Iterable[Expr]], ) -> Iterable[expr_internal.Expr]: for expr in items: if isinstance(expr, Iterable) and not isinstance( - expr, (Expr, str, bytes, bytearray) + expr, Expr | str | bytes | bytearray ): # Treat string-like objects as atomic to surface standard errors yield from _iter(expr) @@ -281,7 +283,7 @@ def _iter( return list(_iter(exprs)) -def _to_raw_expr(value: _typing.Union[Expr, str]) -> expr_internal.Expr: +def _to_raw_expr(value: Expr | str) -> expr_internal.Expr: """Convert a Python expression or column name to its raw variant. Args: @@ -305,17 +307,17 @@ def _to_raw_expr(value: _typing.Union[Expr, str]) -> expr_internal.Expr: def expr_list_to_raw_expr_list( - expr_list: Optional[list[Expr] | Expr], -) -> Optional[list[expr_internal.Expr]]: + expr_list: list[Expr] | Expr | None, +) -> list[expr_internal.Expr] | None: """Convert a sequence of expressions or column names to raw expressions.""" - if isinstance(expr_list, (Expr, str)): + if isinstance(expr_list, Expr | str): expr_list = [expr_list] if expr_list is None: return None return [_to_raw_expr(e) for e in expr_list] -def sort_or_default(e: _typing.Union[Expr, SortExpr]) -> expr_internal.SortExpr: +def sort_or_default(e: Expr | SortExpr) -> expr_internal.SortExpr: """Helper function to return a default Sort if an Expr is provided.""" if isinstance(e, SortExpr): return e.raw_sort @@ -323,10 +325,10 @@ def sort_or_default(e: _typing.Union[Expr, SortExpr]) -> expr_internal.SortExpr: def sort_list_to_raw_sort_list( - sort_list: Optional[_typing.Union[Sequence[SortKey], SortKey]], -) -> Optional[list[expr_internal.SortExpr]]: + sort_list: Sequence[SortKey] | SortKey | None, +) -> list[expr_internal.SortExpr] | None: """Helper function to return an optional sort list to raw variant.""" - if isinstance(sort_list, (Expr, SortExpr, str)): + if isinstance(sort_list, Expr 
| SortExpr | str): sort_list = [sort_list] if sort_list is None: return None @@ -562,8 +564,6 @@ def literal(value: Any) -> Expr: """ if isinstance(value, str): value = pa.scalar(value, type=pa.string_view()) - if not isinstance(value, pa.Scalar): - value = pa.scalar(value) return Expr(expr_internal.RawExpr.literal(value)) @staticmethod @@ -576,7 +576,6 @@ def literal_with_metadata(value: Any, metadata: dict[str, str]) -> Expr: """ if isinstance(value, str): value = pa.scalar(value, type=pa.string_view()) - value = value if isinstance(value, pa.Scalar) else pa.scalar(value) return Expr(expr_internal.RawExpr.literal_with_metadata(value, metadata)) @@ -601,7 +600,7 @@ def column(value: str) -> Expr: """Creates a new expression representing a column.""" return Expr(expr_internal.RawExpr.column(value)) - def alias(self, name: str, metadata: Optional[dict[str, str]] = None) -> Expr: + def alias(self, name: str, metadata: dict[str, str] | None = None) -> Expr: """Assign a name to the expression. 
Args: @@ -630,13 +629,13 @@ def is_not_null(self) -> Expr: """Returns ``True`` if this expression is not null.""" return Expr(self.expr.is_not_null()) - def fill_nan(self, value: Optional[_typing.Union[Any, Expr]] = None) -> Expr: + def fill_nan(self, value: Any | Expr | None = None) -> Expr: """Fill NaN values with a provided value.""" if not isinstance(value, Expr): value = Expr.literal(value) return Expr(functions_internal.nanvl(self.expr, value.expr)) - def fill_null(self, value: Optional[_typing.Union[Any, Expr]] = None) -> Expr: + def fill_null(self, value: Any | Expr | None = None) -> Expr: """Fill NULL values with a provided value.""" if not isinstance(value, Expr): value = Expr.literal(value) @@ -649,7 +648,7 @@ def fill_null(self, value: Optional[_typing.Union[Any, Expr]] = None) -> Expr: bool: pa.bool_(), } - def cast(self, to: _typing.Union[pa.DataType[Any], type]) -> Expr: + def cast(self, to: pa.DataType[Any] | type) -> Expr: """Cast to a new data type.""" if not isinstance(to, pa.DataType): try: @@ -695,7 +694,7 @@ def types(self) -> DataTypeMap: return self.expr.types() def python_value(self) -> Any: - """Extracts the Expr value into a PyObject. + """Extracts the Expr value into `Any`. This is only valid for literal expressions. @@ -722,7 +721,7 @@ def column_name(self, plan: LogicalPlan) -> str: """Compute the output column name based on the provided logical plan.""" return self.expr.column_name(plan._raw_plan) - def order_by(self, *exprs: _typing.Union[Expr, SortExpr]) -> ExprFuncBuilder: + def order_by(self, *exprs: Expr | SortExpr) -> ExprFuncBuilder: """Set the ordering for a window or aggregate function. 
This function will create an :py:class:`ExprFuncBuilder` that can be used to @@ -1271,17 +1270,10 @@ class Window: def __init__( self, - partition_by: Optional[_typing.Union[list[Expr], Expr]] = None, - window_frame: Optional[WindowFrame] = None, - order_by: Optional[ - _typing.Union[ - list[_typing.Union[SortExpr, Expr, str]], - Expr, - SortExpr, - str, - ] - ] = None, - null_treatment: Optional[NullTreatment] = None, + partition_by: list[Expr] | Expr | None = None, + window_frame: WindowFrame | None = None, + order_by: list[SortExpr | Expr | str] | Expr | SortExpr | str | None = None, + null_treatment: NullTreatment | None = None, ) -> None: """Construct a window definition. @@ -1301,7 +1293,7 @@ class WindowFrame: """Defines a window frame for performing window operations.""" def __init__( - self, units: str, start_bound: Optional[Any], end_bound: Optional[Any] + self, units: str, start_bound: Any | None, end_bound: Any | None ) -> None: """Construct a window frame using the given parameters. 
@@ -1351,7 +1343,7 @@ def __init__(self, frame_bound: expr_internal.WindowFrameBound) -> None: """Constructs a window frame bound.""" self.frame_bound = frame_bound - def get_offset(self) -> Optional[int]: + def get_offset(self) -> int | None: """Returns the offset of the window frame.""" return self.frame_bound.get_offset() @@ -1435,4 +1427,4 @@ def __repr__(self) -> str: return self.raw_sort.__repr__() -SortKey = _typing.Union[Expr, SortExpr, str] +SortKey = Expr | SortExpr | str diff --git a/python/datafusion/functions.py b/python/datafusion/functions.py index 472a02fcb..a4933a747 100644 --- a/python/datafusion/functions.py +++ b/python/datafusion/functions.py @@ -18,7 +18,7 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Any, Optional +from typing import TYPE_CHECKING, Any import pyarrow as pa @@ -42,7 +42,6 @@ if TYPE_CHECKING: from datafusion.context import SessionContext - __all__ = [ "abs", "acos", @@ -225,6 +224,7 @@ "range", "rank", "regexp_count", + "regexp_instr", "regexp_like", "regexp_match", "regexp_replace", @@ -267,13 +267,18 @@ "sum", "tan", "tanh", + "to_char", + "to_date", "to_hex", + "to_local_time", + "to_time", "to_timestamp", "to_timestamp_micros", "to_timestamp_millis", "to_timestamp_nanos", "to_timestamp_seconds", "to_unixtime", + "today", "translate", "trim", "trunc", @@ -290,7 +295,15 @@ def isnan(expr: Expr) -> Expr: - """Returns true if a given number is +NaN or -NaN otherwise returns false.""" + """Returns true if a given number is +NaN or -NaN otherwise returns false. + + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1.0, np.nan]}) + >>> result = df.select(dfn.functions.isnan(dfn.col("a")).alias("isnan")) + >>> result.collect_column("isnan")[1].as_py() + True + """ return Expr(f.isnan(expr.expr)) @@ -298,29 +311,65 @@ def nullif(expr1: Expr, expr2: Expr) -> Expr: """Returns NULL if expr1 equals expr2; otherwise it returns expr1. 
This can be used to perform the inverse operation of the COALESCE expression. + + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1, 2], "b": [1, 3]}) + >>> result = df.select( + ... dfn.functions.nullif(dfn.col("a"), dfn.col("b")).alias("nullif")) + >>> result.collect_column("nullif").to_pylist() + [None, 2] """ return Expr(f.nullif(expr1.expr, expr2.expr)) def encode(expr: Expr, encoding: Expr) -> Expr: - """Encode the ``input``, using the ``encoding``. encoding can be base64 or hex.""" + """Encode the ``input``, using the ``encoding``. encoding can be base64 or hex. + + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["hello"]}) + >>> result = df.select( + ... dfn.functions.encode(dfn.col("a"), dfn.lit("base64")).alias("enc")) + >>> result.collect_column("enc")[0].as_py() + 'aGVsbG8' + """ return Expr(f.encode(expr.expr, encoding.expr)) def decode(expr: Expr, encoding: Expr) -> Expr: - """Decode the ``input``, using the ``encoding``. encoding can be base64 or hex.""" + """Decode the ``input``, using the ``encoding``. encoding can be base64 or hex. + + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["aGVsbG8="]}) + >>> result = df.select( + ... dfn.functions.decode(dfn.col("a"), dfn.lit("base64")).alias("dec")) + >>> result.collect_column("dec")[0].as_py() + b'hello' + """ return Expr(f.decode(expr.expr, encoding.expr)) def array_to_string(expr: Expr, delimiter: Expr) -> Expr: - """Converts each element to its text representation.""" + """Converts each element to its text representation. + + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [[1, 2, 3]]}) + >>> result = df.select( + ... 
dfn.functions.array_to_string(dfn.col("a"), dfn.lit(",")).alias("s")) + >>> result.collect_column("s")[0].as_py() + '1,2,3' + """ return Expr(f.array_to_string(expr.expr, delimiter.expr.cast(pa.string()))) def array_join(expr: Expr, delimiter: Expr) -> Expr: """Converts each element to its text representation. - This is an alias for :py:func:`array_to_string`. + See Also: + This is an alias for :py:func:`array_to_string`. """ return array_to_string(expr, delimiter) @@ -328,7 +377,8 @@ def array_join(expr: Expr, delimiter: Expr) -> Expr: def list_to_string(expr: Expr, delimiter: Expr) -> Expr: """Converts each element to its text representation. - This is an alias for :py:func:`array_to_string`. + See Also: + This is an alias for :py:func:`array_to_string`. """ return array_to_string(expr, delimiter) @@ -337,12 +387,27 @@ def list_join(expr: Expr, delimiter: Expr) -> Expr: """Converts each element to its text representation. This is an alias for :py:func:`array_to_string`. + + See Also: + This is an alias for :py:func:`array_to_string`. """ return array_to_string(expr, delimiter) def in_list(arg: Expr, values: list[Expr], negated: bool = False) -> Expr: - """Returns whether the argument is contained within the list ``values``.""" + """Returns whether the argument is contained within the list ``values``. + + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1, 2, 3]}) + >>> result = df.select( + ... dfn.functions.in_list( + ... dfn.col("a"), [dfn.lit(1), dfn.lit(3)] + ... ).alias("in") + ... ) + >>> result.collect_column("in").to_pylist() + [True, False, True] + """ values = [v.expr for v in values] return Expr(f.in_list(arg.expr, values, negated)) @@ -352,6 +417,14 @@ def digest(value: Expr, method: Expr) -> Expr: Standard algorithms are md5, sha224, sha256, sha384, sha512, blake2s, blake2b, and blake3. + + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["hello"]}) + >>> result = df.select( + ... 
dfn.functions.digest(dfn.col("a"), dfn.lit("md5")).alias("d")) + >>> len(result.collect_column("d")[0].as_py()) > 0 + True """ return Expr(f.digest(value.expr, method.expr)) @@ -360,6 +433,15 @@ def concat(*args: Expr) -> Expr: """Concatenates the text representations of all the arguments. NULL arguments are ignored. + + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["hello"], "b": [" world"]}) + >>> result = df.select( + ... dfn.functions.concat(dfn.col("a"), dfn.col("b")).alias("c") + ... ) + >>> result.collect_column("c")[0].as_py() + 'hello world' """ args = [arg.expr for arg in args] return Expr(f.concat(args)) @@ -369,17 +451,31 @@ def concat_ws(separator: str, *args: Expr) -> Expr: """Concatenates the list ``args`` with the separator. ``NULL`` arguments are ignored. ``separator`` should not be ``NULL``. + + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["hello"], "b": ["world"]}) + >>> result = df.select( + ... dfn.functions.concat_ws("-", dfn.col("a"), dfn.col("b")).alias("c")) + >>> result.collect_column("c")[0].as_py() + 'hello-world' """ args = [arg.expr for arg in args] return Expr(f.concat_ws(separator, args)) def order_by(expr: Expr, ascending: bool = True, nulls_first: bool = True) -> SortExpr: - """Creates a new sort expression.""" + """Creates a new sort expression. + + Examples: + >>> sort_expr = dfn.functions.order_by(dfn.col("a"), ascending=False) + >>> sort_expr.ascending() + False + """ return SortExpr(expr, ascending=ascending, nulls_first=nulls_first) -def alias(expr: Expr, name: str, metadata: Optional[dict[str, str]] = None) -> Expr: +def alias(expr: Expr, name: str, metadata: dict[str, str] | None = None) -> Expr: """Creates an alias expression with an optional metadata dictionary. 
Args: @@ -387,18 +483,30 @@ def alias(expr: Expr, name: str, metadata: Optional[dict[str, str]] = None) -> E name: The alias name metadata: Optional metadata to attach to the column - Returns: - An expression with the given alias + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1, 2]}) + >>> df.select( + ... dfn.functions.alias(dfn.col("a"), "b") + ... ).collect_column("b")[0].as_py() + 1 """ return Expr(f.alias(expr.expr, name, metadata)) def col(name: str) -> Expr: - """Creates a column reference expression.""" + """Creates a column reference expression. + + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1, 2, 3]}) + >>> df.select(dfn.functions.col("a")).collect_column("a")[0].as_py() + 1 + """ return Expr(f.col(name)) -def count_star(filter: Optional[Expr] = None) -> Expr: +def count_star(filter: Expr | None = None) -> Expr: """Create a COUNT(1) aggregate expression. This aggregate function will count all of the rows in the partition. @@ -408,6 +516,13 @@ def count_star(filter: Optional[Expr] = None) -> Expr: Args: filter: If provided, only count rows for which the filter is True + + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1, 2, 3]}) + >>> result = df.aggregate([], [dfn.functions.count_star().alias("cnt")]) + >>> result.collect_column("cnt")[0].as_py() + 3 """ return count(Expr.literal(1), filter=filter) @@ -418,6 +533,15 @@ def case(expr: Expr) -> CaseBuilder: Create a :py:class:`~datafusion.expr.CaseBuilder` to match cases for the expression ``expr``. See :py:class:`~datafusion.expr.CaseBuilder` for detailed usage. + + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1, 2, 3]}) + >>> result = df.select( + ... dfn.functions.case(dfn.col("a")).when(dfn.lit(1), + ... 
dfn.lit("one")).otherwise(dfn.lit("other")).alias("c")) + >>> result.collect_column("c")[0].as_py() + 'one' """ return CaseBuilder(f.case(expr.expr)) @@ -428,6 +552,15 @@ def when(when: Expr, then: Expr) -> CaseBuilder: Create a :py:class:`~datafusion.expr.CaseBuilder` to match cases for the expression ``expr``. See :py:class:`~datafusion.expr.CaseBuilder` for detailed usage. + + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1, 2, 3]}) + >>> result = df.select( + ... dfn.functions.when(dfn.col("a") > dfn.lit(2), + ... dfn.lit("big")).otherwise(dfn.lit("small")).alias("c")) + >>> result.collect_column("c")[2].as_py() + 'big' """ return CaseBuilder(f.when(when.expr, then.expr)) @@ -479,10 +612,12 @@ def window( def abs(arg: Expr) -> Expr: """Return the absolute value of a given number. - Returns: - -------- - Expr - A new expression representing the absolute value of the input expression. + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [-1, 0, 1]}) + >>> result = df.select(dfn.functions.abs(dfn.col("a")).alias("abs")) + >>> result.collect_column("abs")[0].as_py() + 1 """ return Expr(f.abs(arg.expr)) @@ -490,127 +625,331 @@ def abs(arg: Expr) -> Expr: def acos(arg: Expr) -> Expr: """Returns the arc cosine or inverse cosine of a number. - Returns: - -------- - Expr - A new expression representing the arc cosine of the input expression. + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1.0]}) + >>> result = df.select(dfn.functions.acos(dfn.col("a")).alias("acos")) + >>> result.collect_column("acos")[0].as_py() + 0.0 """ return Expr(f.acos(arg.expr)) def acosh(arg: Expr) -> Expr: - """Returns inverse hyperbolic cosine.""" + """Returns inverse hyperbolic cosine. 
+ + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1.0]}) + >>> result = df.select(dfn.functions.acosh(dfn.col("a")).alias("acosh")) + >>> result.collect_column("acosh")[0].as_py() + 0.0 + """ return Expr(f.acosh(arg.expr)) def ascii(arg: Expr) -> Expr: - """Returns the numeric code of the first character of the argument.""" + """Returns the numeric code of the first character of the argument. + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["a","b","c"]}) + >>> ascii_df = df.select(dfn.functions.ascii(dfn.col("a")).alias("ascii")) + >>> ascii_df.collect_column("ascii")[0].as_py() + 97 + """ return Expr(f.ascii(arg.expr)) def asin(arg: Expr) -> Expr: - """Returns the arc sine or inverse sine of a number.""" + """Returns the arc sine or inverse sine of a number. + + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [0.0]}) + >>> result = df.select(dfn.functions.asin(dfn.col("a")).alias("asin")) + >>> result.collect_column("asin")[0].as_py() + 0.0 + """ return Expr(f.asin(arg.expr)) def asinh(arg: Expr) -> Expr: - """Returns inverse hyperbolic sine.""" + """Returns inverse hyperbolic sine. + + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [0.0]}) + >>> result = df.select(dfn.functions.asinh(dfn.col("a")).alias("asinh")) + >>> result.collect_column("asinh")[0].as_py() + 0.0 + """ return Expr(f.asinh(arg.expr)) def atan(arg: Expr) -> Expr: - """Returns inverse tangent of a number.""" + """Returns inverse tangent of a number. + + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [0.0]}) + >>> result = df.select(dfn.functions.atan(dfn.col("a")).alias("atan")) + >>> result.collect_column("atan")[0].as_py() + 0.0 + """ return Expr(f.atan(arg.expr)) def atanh(arg: Expr) -> Expr: - """Returns inverse hyperbolic tangent.""" + """Returns inverse hyperbolic tangent. 
+ + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [0.0]}) + >>> result = df.select(dfn.functions.atanh(dfn.col("a")).alias("atanh")) + >>> result.collect_column("atanh")[0].as_py() + 0.0 + """ return Expr(f.atanh(arg.expr)) def atan2(y: Expr, x: Expr) -> Expr: - """Returns inverse tangent of a division given in the argument.""" + """Returns inverse tangent of a division given in the argument. + + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"y": [0.0], "x": [1.0]}) + >>> result = df.select( + ... dfn.functions.atan2(dfn.col("y"), dfn.col("x")).alias("atan2")) + >>> result.collect_column("atan2")[0].as_py() + 0.0 + """ return Expr(f.atan2(y.expr, x.expr)) def bit_length(arg: Expr) -> Expr: - """Returns the number of bits in the string argument.""" + """Returns the number of bits in the string argument. + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["a","b","c"]}) + >>> bit_df = df.select(dfn.functions.bit_length(dfn.col("a")).alias("bit_len")) + >>> bit_df.collect_column("bit_len")[0].as_py() + 8 + """ return Expr(f.bit_length(arg.expr)) def btrim(arg: Expr) -> Expr: - """Removes all characters, spaces by default, from both sides of a string.""" + """Removes all characters, spaces by default, from both sides of a string. + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [" a "]}) + >>> trim_df = df.select(dfn.functions.btrim(dfn.col("a")).alias("trimmed")) + >>> trim_df.collect_column("trimmed")[0].as_py() + 'a' + """ return Expr(f.btrim(arg.expr)) def cbrt(arg: Expr) -> Expr: - """Returns the cube root of a number.""" + """Returns the cube root of a number. 
+ + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [27]}) + >>> cbrt_df = df.select(dfn.functions.cbrt(dfn.col("a")).alias("cbrt")) + >>> cbrt_df.collect_column("cbrt")[0].as_py() + 3.0 + """ return Expr(f.cbrt(arg.expr)) def ceil(arg: Expr) -> Expr: - """Returns the nearest integer greater than or equal to argument.""" + """Returns the nearest integer greater than or equal to argument. + + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1.9]}) + >>> ceil_df = df.select(dfn.functions.ceil(dfn.col("a")).alias("ceil")) + >>> ceil_df.collect_column("ceil")[0].as_py() + 2.0 + """ return Expr(f.ceil(arg.expr)) def character_length(arg: Expr) -> Expr: - """Returns the number of characters in the argument.""" + """Returns the number of characters in the argument. + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["abc","b","c"]}) + >>> char_len_df = df.select( + ... dfn.functions.character_length(dfn.col("a")).alias("char_len")) + >>> char_len_df.collect_column("char_len")[0].as_py() + 3 + """ return Expr(f.character_length(arg.expr)) def length(string: Expr) -> Expr: - """The number of characters in the ``string``.""" + """The number of characters in the ``string``. + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["hello"]}) + >>> result = df.select(dfn.functions.length(dfn.col("a")).alias("len")) + >>> result.collect_column("len")[0].as_py() + 5 + """ return Expr(f.length(string.expr)) def char_length(string: Expr) -> Expr: - """The number of characters in the ``string``.""" + """The number of characters in the ``string``. 
+ + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["hello"]}) + >>> result = df.select(dfn.functions.char_length(dfn.col("a")).alias("len")) + >>> result.collect_column("len")[0].as_py() + 5 + """ return Expr(f.char_length(string.expr)) def chr(arg: Expr) -> Expr: - """Converts the Unicode code point to a UTF8 character.""" + """Converts the Unicode code point to a UTF8 character. + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [65]}) + >>> result = df.select(dfn.functions.chr(dfn.col("a")).alias("chr")) + >>> result.collect_column("chr")[0].as_py() + 'A' + """ return Expr(f.chr(arg.expr)) def coalesce(*args: Expr) -> Expr: - """Returns the value of the first expr in ``args`` which is not NULL.""" + """Returns the value of the first expr in ``args`` which is not NULL. + + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [None, 1], "b": [2, 3]}) + >>> result = df.select( + ... dfn.functions.coalesce(dfn.col("a"), dfn.col("b")).alias("c")) + >>> result.collect_column("c")[0].as_py() + 2 + """ args = [arg.expr for arg in args] return Expr(f.coalesce(*args)) def cos(arg: Expr) -> Expr: - """Returns the cosine of the argument.""" + """Returns the cosine of the argument. + + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [0,-1,1]}) + >>> cos_df = df.select(dfn.functions.cos(dfn.col("a")).alias("cos")) + >>> cos_df.collect_column("cos")[0].as_py() + 1.0 + """ return Expr(f.cos(arg.expr)) def cosh(arg: Expr) -> Expr: - """Returns the hyperbolic cosine of the argument.""" + """Returns the hyperbolic cosine of the argument. 
+ + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [0,-1,1]}) + >>> cosh_df = df.select(dfn.functions.cosh(dfn.col("a")).alias("cosh")) + >>> cosh_df.collect_column("cosh")[0].as_py() + 1.0 + """ return Expr(f.cosh(arg.expr)) def cot(arg: Expr) -> Expr: - """Returns the cotangent of the argument.""" + """Returns the cotangent of the argument. + + Examples: + >>> from math import pi + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [pi / 4]}) + >>> result = df.select( + ... dfn.functions.cot(dfn.col("a")).alias("cot") + ... ) + >>> result.collect_column("cot")[0].as_py() + 1.0... + """ return Expr(f.cot(arg.expr)) def degrees(arg: Expr) -> Expr: - """Converts the argument from radians to degrees.""" + """Converts the argument from radians to degrees. + + Examples: + >>> from math import pi + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [0,pi,2*pi]}) + >>> deg_df = df.select(dfn.functions.degrees(dfn.col("a")).alias("deg")) + >>> deg_df.collect_column("deg")[2].as_py() + 360.0 + """ return Expr(f.degrees(arg.expr)) def ends_with(arg: Expr, suffix: Expr) -> Expr: - """Returns true if the ``string`` ends with the ``suffix``, false otherwise.""" + """Returns true if the ``string`` ends with the ``suffix``, false otherwise. + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["abc","b","c"]}) + >>> ends_with_df = df.select( + ... dfn.functions.ends_with(dfn.col("a"), dfn.lit("c")).alias("ends_with")) + >>> ends_with_df.collect_column("ends_with")[0].as_py() + True + """ return Expr(f.ends_with(arg.expr, suffix.expr)) def exp(arg: Expr) -> Expr: - """Returns the exponential of the argument.""" + """Returns the exponential of the argument. 
+ + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [0.0]}) + >>> result = df.select(dfn.functions.exp(dfn.col("a")).alias("exp")) + >>> result.collect_column("exp")[0].as_py() + 1.0 + """ return Expr(f.exp(arg.expr)) def factorial(arg: Expr) -> Expr: - """Returns the factorial of the argument.""" + """Returns the factorial of the argument. + + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [3]}) + >>> result = df.select( + ... dfn.functions.factorial(dfn.col("a")).alias("factorial") + ... ) + >>> result.collect_column("factorial")[0].as_py() + 6 + """ return Expr(f.factorial(arg.expr)) @@ -621,17 +960,44 @@ def find_in_set(string: Expr, string_list: Expr) -> Expr: ``string_list`` consisting of N substrings. The string list is a string composed of substrings separated by ``,`` characters. + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["b"]}) + >>> result = df.select( + ... dfn.functions.find_in_set(dfn.col("a"), dfn.lit("a,b,c")).alias("pos")) + >>> result.collect_column("pos")[0].as_py() + 2 """ return Expr(f.find_in_set(string.expr, string_list.expr)) def floor(arg: Expr) -> Expr: - """Returns the nearest integer less than or equal to the argument.""" + """Returns the nearest integer less than or equal to the argument. + + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1.9]}) + >>> floor_df = df.select(dfn.functions.floor(dfn.col("a")).alias("floor")) + >>> floor_df.collect_column("floor")[0].as_py() + 1.0 + """ return Expr(f.floor(arg.expr)) def gcd(x: Expr, y: Expr) -> Expr: - """Returns the greatest common divisor.""" + """Returns the greatest common divisor. + + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [12], "b": [8]}) + >>> result = df.select( + ... dfn.functions.gcd(dfn.col("a"), dfn.col("b")).alias("gcd") + ... 
) + >>> result.collect_column("gcd")[0].as_py() + 4 + """ return Expr(f.gcd(x.expr, y.expr)) @@ -640,6 +1006,14 @@ def initcap(string: Expr) -> Expr: Converts the first letter of each word in ``string`` to uppercase and the remaining characters to lowercase. + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["the cat"]}) + >>> cap_df = df.select(dfn.functions.initcap(dfn.col("a")).alias("cap")) + >>> cap_df.collect_column("cap")[0].as_py() + 'The Cat' """ return Expr(f.initcap(string.expr)) @@ -653,47 +1027,127 @@ def instr(string: Expr, substring: Expr) -> Expr: def iszero(arg: Expr) -> Expr: - """Returns true if a given number is +0.0 or -0.0 otherwise returns false.""" + """Returns true if a given number is +0.0 or -0.0 otherwise returns false. + + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [0.0, 1.0]}) + >>> result = df.select(dfn.functions.iszero(dfn.col("a")).alias("iz")) + >>> result.collect_column("iz")[0].as_py() + True + """ return Expr(f.iszero(arg.expr)) def lcm(x: Expr, y: Expr) -> Expr: - """Returns the least common multiple.""" + """Returns the least common multiple. + + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [4], "b": [6]}) + >>> result = df.select( + ... dfn.functions.lcm(dfn.col("a"), dfn.col("b")).alias("lcm") + ... ) + >>> result.collect_column("lcm")[0].as_py() + 12 + """ return Expr(f.lcm(x.expr, y.expr)) def left(string: Expr, n: Expr) -> Expr: - """Returns the first ``n`` characters in the ``string``.""" + """Returns the first ``n`` characters in the ``string``. 
+ + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["the cat"]}) + >>> left_df = df.select(dfn.functions.left(dfn.col("a"), dfn.lit(3)).alias("left")) + >>> left_df.collect_column("left")[0].as_py() + 'the' + """ return Expr(f.left(string.expr, n.expr)) def levenshtein(string1: Expr, string2: Expr) -> Expr: - """Returns the Levenshtein distance between the two given strings.""" + """Returns the Levenshtein distance between the two given strings. + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["kitten"]}) + >>> result = df.select( + ... dfn.functions.levenshtein(dfn.col("a"), dfn.lit("sitting")).alias("d")) + >>> result.collect_column("d")[0].as_py() + 3 + """ return Expr(f.levenshtein(string1.expr, string2.expr)) def ln(arg: Expr) -> Expr: - """Returns the natural logarithm (base e) of the argument.""" + """Returns the natural logarithm (base e) of the argument. + + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1.0]}) + >>> result = df.select(dfn.functions.ln(dfn.col("a")).alias("ln")) + >>> result.collect_column("ln")[0].as_py() + 0.0 + """ return Expr(f.ln(arg.expr)) def log(base: Expr, num: Expr) -> Expr: - """Returns the logarithm of a number for a particular ``base``.""" + """Returns the logarithm of a number for a particular ``base``. + + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [100.0]}) + >>> result = df.select( + ... dfn.functions.log(dfn.lit(10.0), dfn.col("a")).alias("log") + ... ) + >>> result.collect_column("log")[0].as_py() + 2.0 + """ return Expr(f.log(base.expr, num.expr)) def log10(arg: Expr) -> Expr: - """Base 10 logarithm of the argument.""" + """Base 10 logarithm of the argument. 
+ + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [100.0]}) + >>> result = df.select(dfn.functions.log10(dfn.col("a")).alias("log10")) + >>> result.collect_column("log10")[0].as_py() + 2.0 + """ return Expr(f.log10(arg.expr)) def log2(arg: Expr) -> Expr: - """Base 2 logarithm of the argument.""" + """Base 2 logarithm of the argument. + + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [8.0]}) + >>> result = df.select(dfn.functions.log2(dfn.col("a")).alias("log2")) + >>> result.collect_column("log2")[0].as_py() + 3.0 + """ return Expr(f.log2(arg.expr)) def lower(arg: Expr) -> Expr: - """Converts a string to lowercase.""" + """Converts a string to lowercase. + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["THE CaT"]}) + >>> lower_df = df.select(dfn.functions.lower(dfn.col("a")).alias("lower")) + >>> lower_df.collect_column("lower")[0].as_py() + 'the cat' + """ return Expr(f.lower(arg.expr)) @@ -703,33 +1157,92 @@ def lpad(string: Expr, count: Expr, characters: Expr | None = None) -> Expr: Extends the string to length length by prepending the characters fill (a space by default). If the string is already longer than length then it is truncated (on the right). + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["the cat", "a hat"]}) + >>> lpad_df = df.select(dfn.functions.lpad(dfn.col("a"), dfn.lit(6)).alias("lpad")) + >>> lpad_df.collect_column("lpad")[0].as_py() + 'the ca' + >>> lpad_df.collect_column("lpad")[1].as_py() + ' a hat' """ characters = characters if characters is not None else Expr.literal(" ") return Expr(f.lpad(string.expr, count.expr, characters.expr)) def ltrim(arg: Expr) -> Expr: - """Removes all characters, spaces by default, from the beginning of a string.""" + """Removes all characters, spaces by default, from the beginning of a string. 
+ + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [" a "]}) + >>> trim_df = df.select(dfn.functions.ltrim(dfn.col("a")).alias("trimmed")) + >>> trim_df.collect_column("trimmed")[0].as_py() + 'a ' + """ return Expr(f.ltrim(arg.expr)) def md5(arg: Expr) -> Expr: - """Computes an MD5 128-bit checksum for a string expression.""" + """Computes an MD5 128-bit checksum for a string expression. + + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["hello"]}) + >>> result = df.select(dfn.functions.md5(dfn.col("a")).alias("md5")) + >>> result.collect_column("md5")[0].as_py() + '5d41402abc4b2a76b9719d911017c592' + """ return Expr(f.md5(arg.expr)) def nanvl(x: Expr, y: Expr) -> Expr: - """Returns ``x`` if ``x`` is not ``NaN``. Otherwise returns ``y``.""" + """Returns ``x`` if ``x`` is not ``NaN``. Otherwise returns ``y``. + + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [np.nan, 1.0], "b": [0.0, 0.0]}) + >>> nanvl_df = df.select( + ... dfn.functions.nanvl(dfn.col("a"), dfn.col("b")).alias("nanvl")) + >>> nanvl_df.collect_column("nanvl")[0].as_py() + 0.0 + >>> nanvl_df.collect_column("nanvl")[1].as_py() + 1.0 + """ return Expr(f.nanvl(x.expr, y.expr)) def nvl(x: Expr, y: Expr) -> Expr: - """Returns ``x`` if ``x`` is not ``NULL``. Otherwise returns ``y``.""" + """Returns ``x`` if ``x`` is not ``NULL``. Otherwise returns ``y``. + + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [None, 1], "b": [0, 0]}) + >>> nvl_df = df.select( + ... dfn.functions.nvl(dfn.col("a"), dfn.col("b")).alias("nvl") + ... ) + >>> nvl_df.collect_column("nvl")[0].as_py() + 0 + >>> nvl_df.collect_column("nvl")[1].as_py() + 1 + """ return Expr(f.nvl(x.expr, y.expr)) def octet_length(arg: Expr) -> Expr: - """Returns the number of bytes of a string.""" + """Returns the number of bytes of a string. 
+ + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["hello"]}) + >>> result = df.select(dfn.functions.octet_length(dfn.col("a")).alias("len")) + >>> result.collect_column("len")[0].as_py() + 5 + """ return Expr(f.octet_length(arg.expr)) @@ -740,6 +1253,16 @@ def overlay( Replace the substring of string that starts at the ``start``'th character and extends for ``length`` characters with new substring. + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["abcdef"]}) + >>> result = df.select( + ... dfn.functions.overlay(dfn.col("a"), dfn.lit("XY"), dfn.lit(3), + ... dfn.lit(2)).alias("o")) + >>> result.collect_column("o")[0].as_py() + 'abXYef' """ if length is None: return Expr(f.overlay(string.expr, substring.expr, start.expr)) @@ -747,7 +1270,20 @@ def overlay( def pi() -> Expr: - """Returns an approximate value of π.""" + """Returns an approximate value of π. + + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1]}) + >>> import builtins + >>> result = df.select( + ... dfn.functions.pi().alias("pi") + ... ) + >>> builtins.round( + ... result.collect_column("pi")[0].as_py(), 5 + ... ) + 3.14159 + """ return Expr(f.pi()) @@ -760,7 +1296,17 @@ def position(string: Expr, substring: Expr) -> Expr: def power(base: Expr, exponent: Expr) -> Expr: - """Returns ``base`` raised to the power of ``exponent``.""" + """Returns ``base`` raised to the power of ``exponent``. + + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [2.0]}) + >>> result = df.select( + ... dfn.functions.power(dfn.col("a"), dfn.lit(3.0)).alias("pow") + ... ) + >>> result.collect_column("pow")[0].as_py() + 8.0 + """ return Expr(f.power(base.expr, exponent.expr)) @@ -773,15 +1319,37 @@ def pow(base: Expr, exponent: Expr) -> Expr: def radians(arg: Expr) -> Expr: - """Converts the argument from degrees to radians.""" + """Converts the argument from degrees to radians. 
+ + Examples: + >>> from math import pi + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [180.0]}) + >>> result = df.select( + ... dfn.functions.radians(dfn.col("a")).alias("rad") + ... ) + >>> result.collect_column("rad")[0].as_py() == pi + True + """ return Expr(f.radians(arg.expr)) def regexp_like(string: Expr, regex: Expr, flags: Expr | None = None) -> Expr: - """Find if any regular expression (regex) matches exist. + r"""Find if any regular expression (regex) matches exist. Tests a string using a regular expression returning true if at least one match, false otherwise. + + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["hello123"]}) + >>> result = df.select( + ... dfn.functions.regexp_like( + ... dfn.col("a"), dfn.lit("\\d+") + ... ).alias("m") + ... ) + >>> result.collect_column("m")[0].as_py() + True """ if flags is not None: flags = flags.expr @@ -789,10 +1357,21 @@ def regexp_like(string: Expr, regex: Expr, flags: Expr | None = None) -> Expr: def regexp_match(string: Expr, regex: Expr, flags: Expr | None = None) -> Expr: - """Perform regular expression (regex) matching. + r"""Perform regular expression (regex) matching. Returns an array with each element containing the leftmost-first match of the corresponding index in ``regex`` to string in ``string``. + + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["hello 42 world"]}) + >>> result = df.select( + ... dfn.functions.regexp_match( + ... dfn.col("a"), dfn.lit("(\\d+)") + ... ).alias("m") + ... ) + >>> result.collect_column("m")[0].as_py() + ['42'] """ if flags is not None: flags = flags.expr @@ -802,13 +1381,25 @@ def regexp_match(string: Expr, regex: Expr, flags: Expr | None = None) -> Expr: def regexp_replace( string: Expr, pattern: Expr, replacement: Expr, flags: Expr | None = None ) -> Expr: - """Replaces substring(s) matching a PCRE-like regular expression. + r"""Replaces substring(s) matching a PCRE-like regular expression. 
The full list of supported features and syntax can be found at Supported flags with the addition of 'g' can be found at + + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["hello 42"]}) + >>> result = df.select( + ... dfn.functions.regexp_replace( + ... dfn.col("a"), dfn.lit("\\d+"), + ... dfn.lit("XX") + ... ).alias("r") + ... ) + >>> result.collect_column("r")[0].as_py() + 'hello XX' """ if flags is not None: flags = flags.expr @@ -816,36 +1407,128 @@ def regexp_replace( def regexp_count( - string: Expr, pattern: Expr, start: Expr, flags: Expr | None = None + string: Expr, pattern: Expr, start: Expr | None = None, flags: Expr | None = None ) -> Expr: """Returns the number of matches in a string. Optional start position (the first position is 1) to search for the regular expression. + + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["abcabc"]}) + >>> result = df.select( + ... dfn.functions.regexp_count(dfn.col("a"), dfn.lit("abc")).alias("c")) + >>> result.collect_column("c")[0].as_py() + 2 """ if flags is not None: flags = flags.expr - start = start.expr if start is not None else Expr.expr + start = start.expr if start is not None else start return Expr(f.regexp_count(string.expr, pattern.expr, start, flags)) +def regexp_instr( + values: Expr, + regex: Expr, + start: Expr | None = None, + n: Expr | None = None, + flags: Expr | None = None, + sub_expr: Expr | None = None, +) -> Expr: + r"""Returns the position of a regular expression match in a string. + + Args: + values: Data to search for the regular expression match. + regex: Regular expression to search for. + start: Optional position to start the search (the first position is 1). + n: Optional occurrence of the match to find (the first occurrence is 1). + flags: Optional regular expression flags to control regex behavior. + sub_expr: Optionally capture group position instead of the entire match. 
+ + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["hello 42 world"]}) + >>> result = df.select( + ... dfn.functions.regexp_instr( + ... dfn.col("a"), dfn.lit("\\d+") + ... ).alias("pos") + ... ) + >>> result.collect_column("pos")[0].as_py() + 7 + """ + start = start.expr if start is not None else None + n = n.expr if n is not None else None + flags = flags.expr if flags is not None else None + sub_expr = sub_expr.expr if sub_expr is not None else None + + return Expr( + f.regexp_instr( + values.expr, + regex.expr, + start, + n, + flags, + sub_expr, + ) + ) + + def repeat(string: Expr, n: Expr) -> Expr: - """Repeats the ``string`` to ``n`` times.""" + """Repeats the ``string`` to ``n`` times. + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["ha"]}) + >>> result = df.select(dfn.functions.repeat(dfn.col("a"), dfn.lit(3)).alias("r")) + >>> result.collect_column("r")[0].as_py() + 'hahaha' + """ return Expr(f.repeat(string.expr, n.expr)) def replace(string: Expr, from_val: Expr, to_val: Expr) -> Expr: - """Replaces all occurrences of ``from_val`` with ``to_val`` in the ``string``.""" + """Replaces all occurrences of ``from_val`` with ``to_val`` in the ``string``. + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["hello world"]}) + >>> result = df.select( + ... dfn.functions.replace(dfn.col("a"), dfn.lit("world"), + ... dfn.lit("there")).alias("r")) + >>> result.collect_column("r")[0].as_py() + 'hello there' + """ return Expr(f.replace(string.expr, from_val.expr, to_val.expr)) def reverse(arg: Expr) -> Expr: - """Reverse the string argument.""" + """Reverse the string argument. 
+ + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["hello"]}) + >>> result = df.select(dfn.functions.reverse(dfn.col("a")).alias("r")) + >>> result.collect_column("r")[0].as_py() + 'olleh' + """ return Expr(f.reverse(arg.expr)) def right(string: Expr, n: Expr) -> Expr: - """Returns the last ``n`` characters in the ``string``.""" + """Returns the last ``n`` characters in the ``string``. + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["hello"]}) + >>> result = df.select(dfn.functions.right(dfn.col("a"), dfn.lit(3)).alias("r")) + >>> result.collect_column("r")[0].as_py() + 'llo' + """ return Expr(f.right(string.expr, n.expr)) @@ -855,6 +1538,13 @@ def round(value: Expr, decimal_places: Expr | None = None) -> Expr: If the optional ``decimal_places`` is specified, round to the nearest number of decimal places. You can specify a negative number of decimal places. For example ``round(lit(125.2345), lit(-2))`` would yield a value of ``100.0``. + + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1.567]}) + >>> result = df.select(dfn.functions.round(dfn.col("a"), dfn.lit(2)).alias("r")) + >>> result.collect_column("r")[0].as_py() + 1.57 """ if decimal_places is None: decimal_places = Expr.literal(0) @@ -866,48 +1556,130 @@ def rpad(string: Expr, count: Expr, characters: Expr | None = None) -> Expr: Extends the string to length length by appending the characters fill (a space by default). If the string is already longer than length then it is truncated. + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["hi"]}) + >>> result = df.select( + ... dfn.functions.rpad(dfn.col("a"), dfn.lit(5), dfn.lit("!")).alias("r")) + >>> result.collect_column("r")[0].as_py() + 'hi!!!' 
""" characters = characters if characters is not None else Expr.literal(" ") return Expr(f.rpad(string.expr, count.expr, characters.expr)) def rtrim(arg: Expr) -> Expr: - """Removes all characters, spaces by default, from the end of a string.""" + """Removes all characters, spaces by default, from the end of a string. + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [" a "]}) + >>> trim_df = df.select(dfn.functions.rtrim(dfn.col("a")).alias("trimmed")) + >>> trim_df.collect_column("trimmed")[0].as_py() + ' a' + """ return Expr(f.rtrim(arg.expr)) def sha224(arg: Expr) -> Expr: - """Computes the SHA-224 hash of a binary string.""" + """Computes the SHA-224 hash of a binary string. + + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["hello"]}) + >>> result = df.select( + ... dfn.functions.sha224(dfn.col("a")).alias("h") + ... ) + >>> result.collect_column("h")[0].as_py().hex() + 'ea09ae9cc6768c50fcee903ed054556e5bfc8347907f12598aa24193' + """ return Expr(f.sha224(arg.expr)) def sha256(arg: Expr) -> Expr: - """Computes the SHA-256 hash of a binary string.""" + """Computes the SHA-256 hash of a binary string. + + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["hello"]}) + >>> result = df.select( + ... dfn.functions.sha256(dfn.col("a")).alias("h") + ... ) + >>> result.collect_column("h")[0].as_py().hex() + '2cf24dba5fb0a30e26e83b2ac5b9e29e1b161e5c1fa7425e73043362938b9824' + """ return Expr(f.sha256(arg.expr)) def sha384(arg: Expr) -> Expr: - """Computes the SHA-384 hash of a binary string.""" + """Computes the SHA-384 hash of a binary string. + + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["hello"]}) + >>> result = df.select( + ... dfn.functions.sha384(dfn.col("a")).alias("h") + ... ) + >>> result.collect_column("h")[0].as_py().hex() + '59e1748777448c69de6b800d7a33bbfb9ff1b... 
+ """ return Expr(f.sha384(arg.expr)) def sha512(arg: Expr) -> Expr: - """Computes the SHA-512 hash of a binary string.""" + """Computes the SHA-512 hash of a binary string. + + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["hello"]}) + >>> result = df.select( + ... dfn.functions.sha512(dfn.col("a")).alias("h") + ... ) + >>> result.collect_column("h")[0].as_py().hex() + '9b71d224bd62f3785d96d46ad3ea3d73319bfb... + """ return Expr(f.sha512(arg.expr)) def signum(arg: Expr) -> Expr: - """Returns the sign of the argument (-1, 0, +1).""" + """Returns the sign of the argument (-1, 0, +1). + + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [-5.0, 0.0, 5.0]}) + >>> result = df.select(dfn.functions.signum(dfn.col("a")).alias("s")) + >>> result.collect_column("s").to_pylist() + [-1.0, 0.0, 1.0] + """ return Expr(f.signum(arg.expr)) def sin(arg: Expr) -> Expr: - """Returns the sine of the argument.""" + """Returns the sine of the argument. + + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [0.0]}) + >>> result = df.select(dfn.functions.sin(dfn.col("a")).alias("sin")) + >>> result.collect_column("sin")[0].as_py() + 0.0 + """ return Expr(f.sin(arg.expr)) def sinh(arg: Expr) -> Expr: - """Returns the hyperbolic sine of the argument.""" + """Returns the hyperbolic sine of the argument. + + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [0.0]}) + >>> result = df.select(dfn.functions.sinh(dfn.col("a")).alias("sinh")) + >>> result.collect_column("sinh")[0].as_py() + 0.0 + """ return Expr(f.sinh(arg.expr)) @@ -916,27 +1688,73 @@ def split_part(string: Expr, delimiter: Expr, index: Expr) -> Expr: Splits a string based on a delimiter and picks out the desired field based on the index. + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["a,b,c"]}) + >>> result = df.select( + ... 
dfn.functions.split_part(dfn.col("a"), dfn.lit(","), dfn.lit(2)).alias("s")) + >>> result.collect_column("s")[0].as_py() + 'b' """ return Expr(f.split_part(string.expr, delimiter.expr, index.expr)) def sqrt(arg: Expr) -> Expr: - """Returns the square root of the argument.""" + """Returns the square root of the argument. + + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [9.0]}) + >>> result = df.select(dfn.functions.sqrt(dfn.col("a")).alias("sqrt")) + >>> result.collect_column("sqrt")[0].as_py() + 3.0 + """ return Expr(f.sqrt(arg.expr)) def starts_with(string: Expr, prefix: Expr) -> Expr: - """Returns true if string starts with prefix.""" + """Returns true if string starts with prefix. + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["hello_from_datafusion"]}) + >>> result = df.select( + ... dfn.functions.starts_with(dfn.col("a"), dfn.lit("hello")).alias("sw")) + >>> result.collect_column("sw")[0].as_py() + True + """ return Expr(f.starts_with(string.expr, prefix.expr)) def strpos(string: Expr, substring: Expr) -> Expr: - """Finds the position from where the ``substring`` matches the ``string``.""" + """Finds the position from where the ``substring`` matches the ``string``. + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["hello"]}) + >>> result = df.select( + ... dfn.functions.strpos(dfn.col("a"), dfn.lit("llo")).alias("pos")) + >>> result.collect_column("pos")[0].as_py() + 3 + """ return Expr(f.strpos(string.expr, substring.expr)) def substr(string: Expr, position: Expr) -> Expr: - """Substring from the ``position`` to the end.""" + """Substring from the ``position`` to the end. 
+ + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["hello"]}) + >>> result = df.select(dfn.functions.substr(dfn.col("a"), dfn.lit(3)).alias("s")) + >>> result.collect_column("s")[0].as_py() + 'llo' + """ return Expr(f.substr(string.expr, position.expr)) @@ -945,27 +1763,72 @@ def substr_index(string: Expr, delimiter: Expr, count: Expr) -> Expr: The return will be the ``string`` from before ``count`` occurrences of ``delimiter``. + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["a.b.c"]}) + >>> result = df.select( + ... dfn.functions.substr_index(dfn.col("a"), dfn.lit("."), + ... dfn.lit(2)).alias("s")) + >>> result.collect_column("s")[0].as_py() + 'a.b' """ return Expr(f.substr_index(string.expr, delimiter.expr, count.expr)) def substring(string: Expr, position: Expr, length: Expr) -> Expr: - """Substring from the ``position`` with ``length`` characters.""" + """Substring from the ``position`` with ``length`` characters. + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["hello world"]}) + >>> result = df.select( + ... dfn.functions.substring(dfn.col("a"), dfn.lit(1), dfn.lit(5)).alias("s")) + >>> result.collect_column("s")[0].as_py() + 'hello' + """ return Expr(f.substring(string.expr, position.expr, length.expr)) def tan(arg: Expr) -> Expr: - """Returns the tangent of the argument.""" + """Returns the tangent of the argument. + + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [0.0]}) + >>> result = df.select(dfn.functions.tan(dfn.col("a")).alias("tan")) + >>> result.collect_column("tan")[0].as_py() + 0.0 + """ return Expr(f.tan(arg.expr)) def tanh(arg: Expr) -> Expr: - """Returns the hyperbolic tangent of the argument.""" + """Returns the hyperbolic tangent of the argument. 
+ + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [0.0]}) + >>> result = df.select(dfn.functions.tanh(dfn.col("a")).alias("tanh")) + >>> result.collect_column("tanh")[0].as_py() + 0.0 + """ return Expr(f.tanh(arg.expr)) def to_hex(arg: Expr) -> Expr: - """Converts an integer to a hexadecimal string.""" + """Converts an integer to a hexadecimal string. + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [255]}) + >>> result = df.select(dfn.functions.to_hex(dfn.col("a")).alias("hex")) + >>> result.collect_column("hex")[0].as_py() + 'ff' + """ return Expr(f.to_hex(arg.expr)) @@ -973,73 +1836,217 @@ def now() -> Expr: """Returns the current timestamp in nanoseconds. This will use the same value for all instances of now() in same statement. + + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1]}) + >>> result = df.select( + ... dfn.functions.now().alias("now") + ... ) + + Use .value instead of .as_py() because nanosecond timestamps + require pandas to convert to Python datetime objects. + + >>> result.collect_column("now")[0].value > 0 + True """ return Expr(f.now()) +def to_char(arg: Expr, formatter: Expr) -> Expr: + """Returns a string representation of a date, time, timestamp or duration. + + For usage of ``formatter`` see the rust chrono package ``strftime`` package. + + [Documentation here.](https://docs.rs/chrono/latest/chrono/format/strftime/index.html) + """ + return Expr(f.to_char(arg.expr, formatter.expr)) + + +def _unwrap_exprs(args: tuple[Expr, ...]) -> list: + return [arg.expr for arg in args] + + +def to_date(arg: Expr, *formatters: Expr) -> Expr: + """Converts a value to a date (YYYY-MM-DD). + + Supports strings, numeric and timestamp types as input. + Integers and doubles are interpreted as days since the unix epoch. + Strings are parsed as YYYY-MM-DD (e.g. '2023-07-20') + if ``formatters`` are not provided. 
+ + For usage of ``formatters`` see the rust chrono package ``strftime`` package. + + [Documentation here.](https://docs.rs/chrono/latest/chrono/format/strftime/index.html) + """ + return Expr(f.to_date(arg.expr, *_unwrap_exprs(formatters))) + + +def to_local_time(*args: Expr) -> Expr: + """Converts a timestamp with a timezone to a timestamp without a timezone. + + This function handles daylight saving time changes. + """ + return Expr(f.to_local_time(*_unwrap_exprs(args))) + + +def to_time(arg: Expr, *formatters: Expr) -> Expr: + """Converts a value to a time. Supports strings and timestamps as input. + + If ``formatters`` is not provided strings are parsed as HH:MM:SS, HH:MM or + HH:MM:SS.nnnnnnnnn; + + For usage of ``formatters`` see the rust chrono package ``strftime`` package. + + [Documentation here.](https://docs.rs/chrono/latest/chrono/format/strftime/index.html) + """ + return Expr(f.to_time(arg.expr, *_unwrap_exprs(formatters))) + + def to_timestamp(arg: Expr, *formatters: Expr) -> Expr: """Converts a string and optional formats to a ``Timestamp`` in nanoseconds. For usage of ``formatters`` see the rust chrono package ``strftime`` package. [Documentation here.](https://docs.rs/chrono/latest/chrono/format/strftime/index.html) - """ - if formatters is None: - return f.to_timestamp(arg.expr) - formatters = [f.expr for f in formatters] - return Expr(f.to_timestamp(arg.expr, *formatters)) + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["2021-01-01T00:00:00"]}) + >>> result = df.select( + ... dfn.functions.to_timestamp( + ... dfn.col("a") + ... ).alias("ts") + ... ) + >>> str(result.collect_column("ts")[0].as_py()) + '2021-01-01 00:00:00' + """ + return Expr(f.to_timestamp(arg.expr, *_unwrap_exprs(formatters))) def to_timestamp_millis(arg: Expr, *formatters: Expr) -> Expr: """Converts a string and optional formats to a ``Timestamp`` in milliseconds. See :py:func:`to_timestamp` for a description on how to use formatters. 
+ + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["2021-01-01T00:00:00"]}) + >>> result = df.select( + ... dfn.functions.to_timestamp_millis( + ... dfn.col("a") + ... ).alias("ts") + ... ) + >>> str(result.collect_column("ts")[0].as_py()) + '2021-01-01 00:00:00' """ - formatters = [f.expr for f in formatters] - return Expr(f.to_timestamp_millis(arg.expr, *formatters)) + return Expr(f.to_timestamp_millis(arg.expr, *_unwrap_exprs(formatters))) def to_timestamp_micros(arg: Expr, *formatters: Expr) -> Expr: """Converts a string and optional formats to a ``Timestamp`` in microseconds. See :py:func:`to_timestamp` for a description on how to use formatters. + + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["2021-01-01T00:00:00"]}) + >>> result = df.select( + ... dfn.functions.to_timestamp_micros( + ... dfn.col("a") + ... ).alias("ts") + ... ) + >>> str(result.collect_column("ts")[0].as_py()) + '2021-01-01 00:00:00' """ - formatters = [f.expr for f in formatters] - return Expr(f.to_timestamp_micros(arg.expr, *formatters)) + return Expr(f.to_timestamp_micros(arg.expr, *_unwrap_exprs(formatters))) def to_timestamp_nanos(arg: Expr, *formatters: Expr) -> Expr: """Converts a string and optional formats to a ``Timestamp`` in nanoseconds. See :py:func:`to_timestamp` for a description on how to use formatters. + + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["2021-01-01T00:00:00"]}) + >>> result = df.select( + ... dfn.functions.to_timestamp_nanos( + ... dfn.col("a") + ... ).alias("ts") + ... ) + >>> str(result.collect_column("ts")[0].as_py()) + '2021-01-01 00:00:00' """ - formatters = [f.expr for f in formatters] - return Expr(f.to_timestamp_nanos(arg.expr, *formatters)) + return Expr(f.to_timestamp_nanos(arg.expr, *_unwrap_exprs(formatters))) def to_timestamp_seconds(arg: Expr, *formatters: Expr) -> Expr: """Converts a string and optional formats to a ``Timestamp`` in seconds. 
See :py:func:`to_timestamp` for a description on how to use formatters. + + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["2021-01-01T00:00:00"]}) + >>> result = df.select( + ... dfn.functions.to_timestamp_seconds( + ... dfn.col("a") + ... ).alias("ts") + ... ) + >>> str(result.collect_column("ts")[0].as_py()) + '2021-01-01 00:00:00' """ - formatters = [f.expr for f in formatters] - return Expr(f.to_timestamp_seconds(arg.expr, *formatters)) + return Expr(f.to_timestamp_seconds(arg.expr, *_unwrap_exprs(formatters))) def to_unixtime(string: Expr, *format_arguments: Expr) -> Expr: - """Converts a string and optional formats to a Unixtime.""" - args = [f.expr for f in format_arguments] - return Expr(f.to_unixtime(string.expr, *args)) + """Converts a string and optional formats to a Unixtime. + + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["1970-01-01T00:00:00"]}) + >>> result = df.select(dfn.functions.to_unixtime(dfn.col("a")).alias("u")) + >>> result.collect_column("u")[0].as_py() + 0 + """ + return Expr(f.to_unixtime(string.expr, *_unwrap_exprs(format_arguments))) def current_date() -> Expr: - """Returns current UTC date as a Date32 value.""" + """Returns current UTC date as a Date32 value. + + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1]}) + >>> result = df.select( + ... dfn.functions.current_date().alias("d") + ... ) + >>> result.collect_column("d")[0].as_py() is not None + True + """ return Expr(f.current_date()) +today = current_date + + def current_time() -> Expr: - """Returns current UTC time as a Time64 value.""" + """Returns current UTC time as a Time64 value. + + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1]}) + >>> result = df.select( + ... dfn.functions.current_time().alias("t") + ... ) + + Use .value instead of .as_py() because nanosecond timestamps + require pandas to convert to Python datetime objects. 
+ + >>> result.collect_column("t")[0].value > 0 + True + """ return Expr(f.current_time()) @@ -1052,7 +2059,17 @@ def datepart(part: Expr, date: Expr) -> Expr: def date_part(part: Expr, date: Expr) -> Expr: - """Extracts a subfield from the date.""" + """Extracts a subfield from the date. + + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["2021-07-15T00:00:00"]}) + >>> df = df.select(dfn.functions.to_timestamp(dfn.col("a")).alias("a")) + >>> result = df.select( + ... dfn.functions.date_part(dfn.lit("year"), dfn.col("a")).alias("y")) + >>> result.collect_column("y")[0].as_py() + 2021 + """ return Expr(f.date_part(part.expr, date.expr)) @@ -1065,7 +2082,20 @@ def extract(part: Expr, date: Expr) -> Expr: def date_trunc(part: Expr, date: Expr) -> Expr: - """Truncates the date to a specified level of precision.""" + """Truncates the date to a specified level of precision. + + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["2021-07-15T12:34:56"]}) + >>> df = df.select(dfn.functions.to_timestamp(dfn.col("a")).alias("a")) + >>> result = df.select( + ... dfn.functions.date_trunc( + ... dfn.lit("month"), dfn.col("a") + ... ).alias("t") + ... ) + >>> str(result.collect_column("t")[0].as_py()) + '2021-07-01 00:00:00' + """ return Expr(f.date_trunc(part.expr, date.expr)) @@ -1078,39 +2108,113 @@ def datetrunc(part: Expr, date: Expr) -> Expr: def date_bin(stride: Expr, source: Expr, origin: Expr) -> Expr: - """Coerces an arbitrary timestamp to the start of the nearest specified interval.""" + """Coerces an arbitrary timestamp to the start of the nearest specified interval. + + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"timestamp": ['2021-07-15 12:34:56', '2021-01-01']}) + >>> result = df.select( + ... dfn.functions.date_bin( + ... dfn.string_literal("15 minutes"), + ... dfn.col("timestamp"), + ... dfn.string_literal("2001-01-01 00:00:00") + ... ).alias("b") + ... 
) + >>> str(result.collect_column("b")[0].as_py()) + '2021-07-15 12:30:00' + >>> str(result.collect_column("b")[1].as_py()) + '2021-01-01 00:00:00' + """ return Expr(f.date_bin(stride.expr, source.expr, origin.expr)) def make_date(year: Expr, month: Expr, day: Expr) -> Expr: - """Make a date from year, month and day component parts.""" + """Make a date from year, month and day component parts. + + Examples: + >>> from datetime import date + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"y": [2024], "m": [1], "d": [15]}) + >>> result = df.select( + ... dfn.functions.make_date(dfn.col("y"), dfn.col("m"), + ... dfn.col("d")).alias("dt")) + >>> result.collect_column("dt")[0].as_py() + datetime.date(2024, 1, 15) + """ return Expr(f.make_date(year.expr, month.expr, day.expr)) def translate(string: Expr, from_val: Expr, to_val: Expr) -> Expr: - """Replaces the characters in ``from_val`` with the counterpart in ``to_val``.""" + """Replaces the characters in ``from_val`` with the counterpart in ``to_val``. + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["hello"]}) + >>> result = df.select( + ... dfn.functions.translate(dfn.col("a"), dfn.lit("helo"), + ... dfn.lit("HELO")).alias("t")) + >>> result.collect_column("t")[0].as_py() + 'HELLO' + """ return Expr(f.translate(string.expr, from_val.expr, to_val.expr)) def trim(arg: Expr) -> Expr: - """Removes all characters, spaces by default, from both sides of a string.""" + """Removes all characters, spaces by default, from both sides of a string. 
+ + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [" hello "]}) + >>> result = df.select(dfn.functions.trim(dfn.col("a")).alias("t")) + >>> result.collect_column("t")[0].as_py() + 'hello' + """ return Expr(f.trim(arg.expr)) def trunc(num: Expr, precision: Expr | None = None) -> Expr: - """Truncate the number toward zero with optional precision.""" + """Truncate the number toward zero with optional precision. + + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1.567]}) + >>> result = df.select(dfn.functions.trunc(dfn.col("a")).alias("t")) + >>> result.collect_column("t")[0].as_py() + 1.0 + """ if precision is not None: return Expr(f.trunc(num.expr, precision.expr)) return Expr(f.trunc(num.expr)) def upper(arg: Expr) -> Expr: - """Converts a string to uppercase.""" + """Converts a string to uppercase. + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["hello"]}) + >>> result = df.select(dfn.functions.upper(dfn.col("a")).alias("u")) + >>> result.collect_column("u")[0].as_py() + 'HELLO' + """ return Expr(f.upper(arg.expr)) def make_array(*args: Expr) -> Expr: - """Returns an array using the specified input expressions.""" + """Returns an array using the specified input expressions. + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1]}) + >>> result = df.select( + ... dfn.functions.make_array(dfn.lit(1), dfn.lit(2), dfn.lit(3)).alias("arr")) + >>> result.collect_column("arr")[0].as_py() + [1, 2, 3] + """ args = [arg.expr for arg in args] return Expr(f.make_array(args)) @@ -1132,23 +2236,71 @@ def array(*args: Expr) -> Expr: def range(start: Expr, stop: Expr, step: Expr) -> Expr: - """Create a list of values in the range between start and stop.""" + """Create a list of values in the range between start and stop. 
+ + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1]}) + >>> result = df.select( + ... dfn.functions.range(dfn.lit(0), dfn.lit(5), dfn.lit(2)).alias("r")) + >>> result.collect_column("r")[0].as_py() + [0, 2, 4] + """ return Expr(f.range(start.expr, stop.expr, step.expr)) def uuid() -> Expr: - """Returns uuid v4 as a string value.""" + """Returns uuid v4 as a string value. + + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1]}) + >>> result = df.select( + ... dfn.functions.uuid().alias("u") + ... ) + >>> len(result.collect_column("u")[0].as_py()) == 36 + True + """ return Expr(f.uuid()) def struct(*args: Expr) -> Expr: - """Returns a struct with the given arguments.""" + """Returns a struct with the given arguments. + + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1], "b": [2]}) + >>> result = df.select( + ... dfn.functions.struct( + ... dfn.col("a"), dfn.col("b") + ... ).alias("s") + ... ) + + Children in the new struct will always be `c0`, ..., `cN-1` + for `N` children. + + >>> result.collect_column("s")[0].as_py() == {"c0": 1, "c1": 2} + True + """ args = [arg.expr for arg in args] return Expr(f.struct(*args)) def named_struct(name_pairs: list[tuple[str, Expr]]) -> Expr: - """Returns a struct with the given names and arguments pairs.""" + """Returns a struct with the given names and arguments pairs. + + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1]}) + >>> result = df.select( + ... dfn.functions.named_struct( + ... [("x", dfn.lit(10)), ("y", dfn.lit(20))] + ... ).alias("s") + ... 
) + >>> result.collect_column("s")[0].as_py() == {"x": 10, "y": 20} + True + """ name_pair_exprs = [ [Expr.literal(pa.scalar(pair[0], type=pa.string())), pair[1]] for pair in name_pairs @@ -1160,27 +2312,79 @@ def named_struct(name_pairs: list[tuple[str, Expr]]) -> Expr: def from_unixtime(arg: Expr) -> Expr: - """Converts an integer to RFC3339 timestamp format string.""" + """Converts an integer to RFC3339 timestamp format string. + + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [0]}) + >>> result = df.select( + ... dfn.functions.from_unixtime( + ... dfn.col("a") + ... ).alias("ts") + ... ) + >>> str(result.collect_column("ts")[0].as_py()) + '1970-01-01 00:00:00' + """ return Expr(f.from_unixtime(arg.expr)) def arrow_typeof(arg: Expr) -> Expr: - """Returns the Arrow type of the expression.""" + """Returns the Arrow type of the expression. + + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1]}) + >>> result = df.select(dfn.functions.arrow_typeof(dfn.col("a")).alias("t")) + >>> result.collect_column("t")[0].as_py() + 'Int64' + """ return Expr(f.arrow_typeof(arg.expr)) def arrow_cast(expr: Expr, data_type: Expr) -> Expr: - """Casts an expression to a specified data type.""" + """Casts an expression to a specified data type. + + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1]}) + >>> data_type = dfn.string_literal("Float64") + >>> result = df.select( + ... dfn.functions.arrow_cast(dfn.col("a"), data_type).alias("c") + ... ) + >>> result.collect_column("c")[0].as_py() + 1.0 + """ return Expr(f.arrow_cast(expr.expr, data_type.expr)) def random() -> Expr: - """Returns a random value in the range ``0.0 <= x < 1.0``.""" + """Returns a random value in the range ``0.0 <= x < 1.0``. + + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1]}) + >>> result = df.select( + ... dfn.functions.random().alias("r") + ... 
) + >>> val = result.collect_column("r")[0].as_py() + >>> 0.0 <= val < 1.0 + True + """ return Expr(f.random()) def array_append(array: Expr, element: Expr) -> Expr: - """Appends an element to the end of an array.""" + """Appends an element to the end of an array. + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [[1, 2, 3]]}) + >>> result = df.select( + ... dfn.functions.array_append(dfn.col("a"), dfn.lit(4)).alias("result")) + >>> result.collect_column("result")[0].as_py() + [1, 2, 3, 4] + """ return Expr(f.array_append(array.expr, element.expr)) @@ -1209,7 +2413,17 @@ def list_push_back(array: Expr, element: Expr) -> Expr: def array_concat(*args: Expr) -> Expr: - """Concatenates the input arrays.""" + """Concatenates the input arrays. + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [[1, 2]], "b": [[3, 4]]}) + >>> result = df.select( + ... dfn.functions.array_concat(dfn.col("a"), dfn.col("b")).alias("result")) + >>> result.collect_column("result")[0].as_py() + [1, 2, 3, 4] + """ args = [arg.expr for arg in args] return Expr(f.array_concat(args)) @@ -1223,12 +2437,36 @@ def array_cat(*args: Expr) -> Expr: def array_dims(array: Expr) -> Expr: - """Returns an array of the array's dimensions.""" + """Returns an array of the array's dimensions. + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [[1, 2, 3]]}) + >>> result = df.select(dfn.functions.array_dims(dfn.col("a")).alias("result")) + >>> result.collect_column("result")[0].as_py() + [3] + """ return Expr(f.array_dims(array.expr)) def array_distinct(array: Expr) -> Expr: - """Returns distinct values from the array after removing duplicates.""" + """Returns distinct values from the array after removing duplicates. + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [[1, 1, 2, 3]]}) + >>> result = df.select( + ... dfn.functions.array_distinct( + ... 
dfn.col("a") + ... ).alias("result") + ... ) + >>> sorted( + ... result.collect_column("result")[0].as_py() + ... ) + [1, 2, 3] + """ return Expr(f.array_distinct(array.expr)) @@ -1265,12 +2503,31 @@ def list_dims(array: Expr) -> Expr: def array_element(array: Expr, n: Expr) -> Expr: - """Extracts the element with the index n from the array.""" + """Extracts the element with the index n from the array. + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [[10, 20, 30]]}) + >>> result = df.select( + ... dfn.functions.array_element(dfn.col("a"), dfn.lit(2)).alias("result")) + >>> result.collect_column("result")[0].as_py() + 20 + """ return Expr(f.array_element(array.expr, n.expr)) def array_empty(array: Expr) -> Expr: - """Returns a boolean indicating whether the array is empty.""" + """Returns a boolean indicating whether the array is empty. + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [[1, 2]]}) + >>> result = df.select(dfn.functions.array_empty(dfn.col("a")).alias("result")) + >>> result.collect_column("result")[0].as_py() + False + """ return Expr(f.array_empty(array.expr)) @@ -1299,7 +2556,16 @@ def list_extract(array: Expr, n: Expr) -> Expr: def array_length(array: Expr) -> Expr: - """Returns the length of the array.""" + """Returns the length of the array. + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [[1, 2, 3]]}) + >>> result = df.select(dfn.functions.array_length(dfn.col("a")).alias("result")) + >>> result.collect_column("result")[0].as_py() + 3 + """ return Expr(f.array_length(array.expr)) @@ -1312,7 +2578,17 @@ def list_length(array: Expr) -> Expr: def array_has(first_array: Expr, second_array: Expr) -> Expr: - """Returns true if the element appears in the first array, otherwise false.""" + """Returns true if the element appears in the first array, otherwise false. 
+ + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [[1, 2, 3]]}) + >>> result = df.select( + ... dfn.functions.array_has(dfn.col("a"), dfn.lit(2)).alias("result")) + >>> result.collect_column("result")[0].as_py() + True + """ return Expr(f.array_has(first_array.expr, second_array.expr)) @@ -1321,6 +2597,15 @@ def array_has_all(first_array: Expr, second_array: Expr) -> Expr: Returns true if each element of the second array appears in the first array. Otherwise, it returns false. + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [[1, 2, 3]], "b": [[1, 2]]}) + >>> result = df.select( + ... dfn.functions.array_has_all(dfn.col("a"), dfn.col("b")).alias("result")) + >>> result.collect_column("result")[0].as_py() + True """ return Expr(f.array_has_all(first_array.expr, second_array.expr)) @@ -1330,12 +2615,31 @@ def array_has_any(first_array: Expr, second_array: Expr) -> Expr: Returns true if at least one element of the second array appears in the first array. Otherwise, it returns false. + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [[1, 2, 3]], "b": [[2, 5]]}) + >>> result = df.select( + ... dfn.functions.array_has_any(dfn.col("a"), dfn.col("b")).alias("result")) + >>> result.collect_column("result")[0].as_py() + True """ return Expr(f.array_has_any(first_array.expr, second_array.expr)) def array_position(array: Expr, element: Expr, index: int | None = 1) -> Expr: - """Return the position of the first occurrence of ``element`` in ``array``.""" + """Return the position of the first occurrence of ``element`` in ``array``. + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [[10, 20, 30]]}) + >>> result = df.select( + ... 
dfn.functions.array_position(dfn.col("a"), dfn.lit(20)).alias("result")) + >>> result.collect_column("result")[0].as_py() + 2 + """ return Expr(f.array_position(array.expr, element.expr, index)) @@ -1364,7 +2668,17 @@ def list_indexof(array: Expr, element: Expr, index: int | None = 1) -> Expr: def array_positions(array: Expr, element: Expr) -> Expr: - """Searches for an element in the array and returns all occurrences.""" + """Searches for an element in the array and returns all occurrences. + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [[1, 2, 1]]}) + >>> result = df.select( + ... dfn.functions.array_positions(dfn.col("a"), dfn.lit(1)).alias("result")) + >>> result.collect_column("result")[0].as_py() + [1, 3] + """ return Expr(f.array_positions(array.expr, element.expr)) @@ -1377,7 +2691,16 @@ def list_positions(array: Expr, element: Expr) -> Expr: def array_ndims(array: Expr) -> Expr: - """Returns the number of dimensions of the array.""" + """Returns the number of dimensions of the array. + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [[1, 2, 3]]}) + >>> result = df.select(dfn.functions.array_ndims(dfn.col("a")).alias("result")) + >>> result.collect_column("result")[0].as_py() + 1 + """ return Expr(f.array_ndims(array.expr)) @@ -1390,7 +2713,17 @@ def list_ndims(array: Expr) -> Expr: def array_prepend(element: Expr, array: Expr) -> Expr: - """Prepends an element to the beginning of an array.""" + """Prepends an element to the beginning of an array. + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [[1, 2]]}) + >>> result = df.select( + ... 
dfn.functions.array_prepend(dfn.lit(0), dfn.col("a")).alias("result")) + >>> result.collect_column("result")[0].as_py() + [0, 1, 2] + """ return Expr(f.array_prepend(element.expr, array.expr)) @@ -1419,17 +2752,45 @@ def list_push_front(element: Expr, array: Expr) -> Expr: def array_pop_back(array: Expr) -> Expr: - """Returns the array without the last element.""" + """Returns the array without the last element. + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [[1, 2, 3]]}) + >>> result = df.select(dfn.functions.array_pop_back(dfn.col("a")).alias("result")) + >>> result.collect_column("result")[0].as_py() + [1, 2] + """ return Expr(f.array_pop_back(array.expr)) def array_pop_front(array: Expr) -> Expr: - """Returns the array without the first element.""" + """Returns the array without the first element. + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [[1, 2, 3]]}) + >>> result = df.select(dfn.functions.array_pop_front(dfn.col("a")).alias("result")) + >>> result.collect_column("result")[0].as_py() + [2, 3] + """ return Expr(f.array_pop_front(array.expr)) def array_remove(array: Expr, element: Expr) -> Expr: - """Removes the first element from the array equal to the given value.""" + """Removes the first element from the array equal to the given value. + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [[1, 2, 1]]}) + >>> result = df.select( + ... dfn.functions.array_remove(dfn.col("a"), dfn.lit(1)).alias("result")) + >>> result.collect_column("result")[0].as_py() + [2, 1] + """ return Expr(f.array_remove(array.expr, element.expr)) @@ -1442,7 +2803,18 @@ def list_remove(array: Expr, element: Expr) -> Expr: def array_remove_n(array: Expr, element: Expr, max: Expr) -> Expr: - """Removes the first ``max`` elements from the array equal to the given value.""" + """Removes the first ``max`` elements from the array equal to the given value. 
+ + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [[1, 2, 1, 1]]}) + >>> result = df.select( + ... dfn.functions.array_remove_n(dfn.col("a"), dfn.lit(1), + ... dfn.lit(2)).alias("result")) + >>> result.collect_column("result")[0].as_py() + [2, 1] + """ return Expr(f.array_remove_n(array.expr, element.expr, max.expr)) @@ -1455,7 +2827,17 @@ def list_remove_n(array: Expr, element: Expr, max: Expr) -> Expr: def array_remove_all(array: Expr, element: Expr) -> Expr: - """Removes all elements from the array equal to the given value.""" + """Removes all elements from the array equal to the given value. + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [[1, 2, 1]]}) + >>> result = df.select( + ... dfn.functions.array_remove_all(dfn.col("a"), dfn.lit(1)).alias("result")) + >>> result.collect_column("result")[0].as_py() + [2] + """ return Expr(f.array_remove_all(array.expr, element.expr)) @@ -1468,7 +2850,17 @@ def list_remove_all(array: Expr, element: Expr) -> Expr: def array_repeat(element: Expr, count: Expr) -> Expr: - """Returns an array containing ``element`` ``count`` times.""" + """Returns an array containing ``element`` ``count`` times. + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1]}) + >>> result = df.select( + ... dfn.functions.array_repeat(dfn.lit(3), dfn.lit(3)).alias("result")) + >>> result.collect_column("result")[0].as_py() + [3, 3, 3] + """ return Expr(f.array_repeat(element.expr, count.expr)) @@ -1481,7 +2873,18 @@ def list_repeat(element: Expr, count: Expr) -> Expr: def array_replace(array: Expr, from_val: Expr, to_val: Expr) -> Expr: - """Replaces the first occurrence of ``from_val`` with ``to_val``.""" + """Replaces the first occurrence of ``from_val`` with ``to_val``. + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [[1, 2, 1]]}) + >>> result = df.select( + ... 
dfn.functions.array_replace(dfn.col("a"), dfn.lit(1), + ... dfn.lit(9)).alias("result")) + >>> result.collect_column("result")[0].as_py() + [9, 2, 1] + """ return Expr(f.array_replace(array.expr, from_val.expr, to_val.expr)) @@ -1498,6 +2901,16 @@ def array_replace_n(array: Expr, from_val: Expr, to_val: Expr, max: Expr) -> Exp Replaces the first ``max`` occurrences of the specified element with another specified element. + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [[1, 2, 1, 1]]}) + >>> result = df.select( + ... dfn.functions.array_replace_n(dfn.col("a"), dfn.lit(1), dfn.lit(9), + ... dfn.lit(2)).alias("result")) + >>> result.collect_column("result")[0].as_py() + [9, 2, 9, 1] """ return Expr(f.array_replace_n(array.expr, from_val.expr, to_val.expr, max.expr)) @@ -1514,7 +2927,18 @@ def list_replace_n(array: Expr, from_val: Expr, to_val: Expr, max: Expr) -> Expr def array_replace_all(array: Expr, from_val: Expr, to_val: Expr) -> Expr: - """Replaces all occurrences of ``from_val`` with ``to_val``.""" + """Replaces all occurrences of ``from_val`` with ``to_val``. + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [[1, 2, 1]]}) + >>> result = df.select( + ... dfn.functions.array_replace_all(dfn.col("a"), dfn.lit(1), + ... dfn.lit(9)).alias("result")) + >>> result.collect_column("result")[0].as_py() + [9, 2, 9] + """ return Expr(f.array_replace_all(array.expr, from_val.expr, to_val.expr)) @@ -1533,6 +2957,14 @@ def array_sort(array: Expr, descending: bool = False, null_first: bool = False) array: The input array to sort. descending: If True, sorts in descending order. null_first: If True, nulls will be returned at the beginning of the array. 
+ + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [[3, 1, 2]]}) + >>> result = df.select(dfn.functions.array_sort(dfn.col("a")).alias("result")) + >>> result.collect_column("result")[0].as_py() + [1, 2, 3] """ desc = "DESC" if descending else "ASC" nulls_first = "NULLS FIRST" if null_first else "NULLS LAST" @@ -1553,7 +2985,18 @@ def list_sort(array: Expr, descending: bool = False, null_first: bool = False) - def array_slice( array: Expr, begin: Expr, end: Expr, stride: Expr | None = None ) -> Expr: - """Returns a slice of the array.""" + """Returns a slice of the array. + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [[1, 2, 3, 4]]}) + >>> result = df.select( + ... dfn.functions.array_slice(dfn.col("a"), dfn.lit(2), + ... dfn.lit(3)).alias("result")) + >>> result.collect_column("result")[0].as_py() + [2, 3] + """ if stride is not None: stride = stride.expr return Expr(f.array_slice(array.expr, begin.expr, end.expr, stride)) @@ -1568,7 +3011,22 @@ def list_slice(array: Expr, begin: Expr, end: Expr, stride: Expr | None = None) def array_intersect(array1: Expr, array2: Expr) -> Expr: - """Returns the intersection of ``array1`` and ``array2``.""" + """Returns the intersection of ``array1`` and ``array2``. + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [[1, 2, 3]], "b": [[2, 3, 4]]}) + >>> result = df.select( + ... dfn.functions.array_intersect( + ... dfn.col("a"), dfn.col("b") + ... ).alias("result") + ... ) + >>> sorted( + ... result.collect_column("result")[0].as_py() + ... ) + [2, 3] + """ return Expr(f.array_intersect(array1.expr, array2.expr)) @@ -1584,6 +3042,20 @@ def array_union(array1: Expr, array2: Expr) -> Expr: """Returns an array of the elements in the union of array1 and array2. Duplicate rows will not be returned. 
+ + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [[1, 2, 3]], "b": [[2, 3, 4]]}) + >>> result = df.select( + ... dfn.functions.array_union( + ... dfn.col("a"), dfn.col("b") + ... ).alias("result") + ... ) + >>> sorted( + ... result.collect_column("result")[0].as_py() + ... ) + [1, 2, 3, 4] """ return Expr(f.array_union(array1.expr, array2.expr)) @@ -1599,7 +3071,17 @@ def list_union(array1: Expr, array2: Expr) -> Expr: def array_except(array1: Expr, array2: Expr) -> Expr: - """Returns the elements that appear in ``array1`` but not in ``array2``.""" + """Returns the elements that appear in ``array1`` but not in ``array2``. + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [[1, 2, 3]], "b": [[2, 3, 4]]}) + >>> result = df.select( + ... dfn.functions.array_except(dfn.col("a"), dfn.col("b")).alias("result")) + >>> result.collect_column("result")[0].as_py() + [1] + """ return Expr(f.array_except(array1.expr, array2.expr)) @@ -1616,6 +3098,16 @@ def array_resize(array: Expr, size: Expr, value: Expr) -> Expr: If ``size`` is greater than the ``array`` length, the additional entries will be filled with the given ``value``. + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [[1, 2]]}) + >>> result = df.select( + ... dfn.functions.array_resize(dfn.col("a"), dfn.lit(4), + ... dfn.lit(0)).alias("result")) + >>> result.collect_column("result")[0].as_py() + [1, 2, 0, 0] """ return Expr(f.array_resize(array.expr, size.expr, value.expr)) @@ -1630,12 +3122,30 @@ def list_resize(array: Expr, size: Expr, value: Expr) -> Expr: def flatten(array: Expr) -> Expr: - """Flattens an array of arrays into a single array.""" + """Flattens an array of arrays into a single array. 
+ + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [[[1, 2], [3, 4]]]}) + >>> result = df.select(dfn.functions.flatten(dfn.col("a")).alias("result")) + >>> result.collect_column("result")[0].as_py() + [1, 2, 3, 4] + """ return Expr(f.flatten(array.expr)) def cardinality(array: Expr) -> Expr: - """Returns the total number of elements in the array.""" + """Returns the total number of elements in the array. + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [[1, 2, 3]]}) + >>> result = df.select(dfn.functions.cardinality(dfn.col("a")).alias("result")) + >>> result.collect_column("result")[0].as_py() + 3 + """ return Expr(f.cardinality(array.expr)) @@ -1647,7 +3157,7 @@ def empty(array: Expr) -> Expr: # aggregate functions def approx_distinct( expression: Expr, - filter: Optional[Expr] = None, + filter: Expr | None = None, ) -> Expr: """Returns the approximate number of distinct values. @@ -1661,13 +3171,22 @@ def approx_distinct( Args: expression: Values to check for distinct entries filter: If provided, only compute against rows for which the filter is True + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1, 1, 2, 3]}) + >>> result = df.aggregate( + ... [], [dfn.functions.approx_distinct(dfn.col("a")).alias("v")]) + >>> result.collect_column("v")[0].as_py() == 3 + True """ filter_raw = filter.expr if filter is not None else None return Expr(f.approx_distinct(expression.expr, filter=filter_raw)) -def approx_median(expression: Expr, filter: Optional[Expr] = None) -> Expr: +def approx_median(expression: Expr, filter: Expr | None = None) -> Expr: """Returns the approximate median value. 
This aggregate function is similar to :py:func:`median`, but it will only @@ -1679,6 +3198,15 @@ def approx_median(expression: Expr, filter: Optional[Expr] = None) -> Expr: Args: expression: Values to find the median for filter: If provided, only compute against rows for which the filter is True + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0]}) + >>> result = df.aggregate( + ... [], [dfn.functions.approx_median(dfn.col("a")).alias("v")]) + >>> result.collect_column("v")[0].as_py() + 2.0 """ filter_raw = filter.expr if filter is not None else None return Expr(f.approx_median(expression.expr, filter=filter_raw)) @@ -1687,8 +3215,8 @@ def approx_median(expression: Expr, filter: Optional[Expr] = None) -> Expr: def approx_percentile_cont( sort_expression: Expr | SortExpr, percentile: float, - num_centroids: Optional[int] = None, - filter: Optional[Expr] = None, + num_centroids: int | None = None, + filter: Expr | None = None, ) -> Expr: """Returns the value that is approximately at a given percentile of ``expr``. @@ -1710,6 +3238,15 @@ def approx_percentile_cont( percentile: This must be between 0.0 and 1.0, inclusive num_centroids: Max bin size for the t-digest algorithm filter: If provided, only compute against rows for which the filter is True + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0, 4.0, 5.0]}) + >>> result = df.aggregate( + ... 
[], [dfn.functions.approx_percentile_cont(dfn.col("a"), 0.5).alias("v")]) + >>> result.collect_column("v")[0].as_py() + 3.0 """ sort_expr_raw = sort_or_default(sort_expression) filter_raw = filter.expr if filter is not None else None @@ -1724,8 +3261,8 @@ def approx_percentile_cont_with_weight( sort_expression: Expr | SortExpr, weight: Expr, percentile: float, - num_centroids: Optional[int] = None, - filter: Optional[Expr] = None, + num_centroids: int | None = None, + filter: Expr | None = None, ) -> Expr: """Returns the value of the weighted approximate percentile. @@ -1742,6 +3279,15 @@ def approx_percentile_cont_with_weight( num_centroids: Max bin size for the t-digest algorithm filter: If provided, only compute against rows for which the filter is True + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0], "w": [1.0, 1.0, 1.0]}) + >>> result = df.aggregate( + ... [], [dfn.functions.approx_percentile_cont_with_weight(dfn.col("a"), + ... dfn.col("w"), 0.5).alias("v")]) + >>> result.collect_column("v")[0].as_py() + 2.0 """ sort_expr_raw = sort_or_default(sort_expression) filter_raw = filter.expr if filter is not None else None @@ -1759,8 +3305,8 @@ def approx_percentile_cont_with_weight( def array_agg( expression: Expr, distinct: bool = False, - filter: Optional[Expr] = None, - order_by: Optional[list[SortKey] | SortKey] = None, + filter: Expr | None = None, + order_by: list[SortKey] | SortKey | None = None, ) -> Expr: """Aggregate values into an array. 
@@ -1779,7 +3325,15 @@ def array_agg( For example:: - df.select(array_agg(col("a"), order_by="b")) + df.aggregate([], array_agg(col("a"), order_by="b")) + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1, 2, 3]}) + >>> result = df.aggregate([], [dfn.functions.array_agg(dfn.col("a")).alias("v")]) + >>> result.collect_column("v")[0].as_py() + [1, 2, 3] """ order_by_raw = sort_list_to_raw_sort_list(order_by) filter_raw = filter.expr if filter is not None else None @@ -1793,7 +3347,7 @@ def array_agg( def avg( expression: Expr, - filter: Optional[Expr] = None, + filter: Expr | None = None, ) -> Expr: """Returns the average value. @@ -1805,12 +3359,20 @@ def avg( Args: expression: Values to combine into an array filter: If provided, only compute against rows for which the filter is True + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0]}) + >>> result = df.aggregate([], [dfn.functions.avg(dfn.col("a")).alias("v")]) + >>> result.collect_column("v")[0].as_py() + 2.0 """ filter_raw = filter.expr if filter is not None else None return Expr(f.avg(expression.expr, filter=filter_raw)) -def corr(value_y: Expr, value_x: Expr, filter: Optional[Expr] = None) -> Expr: +def corr(value_y: Expr, value_x: Expr, filter: Expr | None = None) -> Expr: """Returns the correlation coefficient between ``value1`` and ``value2``. This aggregate function expects both values to be numeric and will return a float. @@ -1822,6 +3384,14 @@ def corr(value_y: Expr, value_x: Expr, filter: Optional[Expr] = None) -> Expr: value_y: The dependent variable for correlation value_x: The independent variable for correlation filter: If provided, only compute against rows for which the filter is True + + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0], "b": [1.0, 2.0, 3.0]}) + >>> result = df.aggregate( + ... 
[], [dfn.functions.corr(dfn.col("a"), dfn.col("b")).alias("v")]) + >>> result.collect_column("v")[0].as_py() + 1.0 """ filter_raw = filter.expr if filter is not None else None return Expr(f.corr(value_y.expr, value_x.expr, filter=filter_raw)) @@ -1830,7 +3400,7 @@ def corr(value_y: Expr, value_x: Expr, filter: Optional[Expr] = None) -> Expr: def count( expressions: Expr | list[Expr] | None = None, distinct: bool = False, - filter: Optional[Expr] = None, + filter: Expr | None = None, ) -> Expr: """Returns the number of rows that match the given arguments. @@ -1843,6 +3413,14 @@ def count( expressions: Argument to perform bitwise calculation on distinct: If True, a single entry for each distinct value will be in the result filter: If provided, only compute against rows for which the filter is True + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1, 2, 3]}) + >>> result = df.aggregate([], [dfn.functions.count(dfn.col("a")).alias("v")]) + >>> result.collect_column("v")[0].as_py() + 3 """ filter_raw = filter.expr if filter is not None else None @@ -1856,7 +3434,7 @@ def count( return Expr(f.count(*args, distinct=distinct, filter=filter_raw)) -def covar_pop(value_y: Expr, value_x: Expr, filter: Optional[Expr] = None) -> Expr: +def covar_pop(value_y: Expr, value_x: Expr, filter: Expr | None = None) -> Expr: """Computes the population covariance. This aggregate function expects both values to be numeric and will return a float. @@ -1868,12 +3446,24 @@ def covar_pop(value_y: Expr, value_x: Expr, filter: Optional[Expr] = None) -> Ex value_y: The dependent variable for covariance value_x: The independent variable for covariance filter: If provided, only compute against rows for which the filter is True + + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1.0, 5.0, 10.0], "b": [1.0, 2.0, 3.0]}) + >>> result = df.aggregate( + ... [], + ... [dfn.functions.covar_pop( + ... dfn.col("a"), dfn.col("b") + ... 
).alias("v")] + ... ) + >>> result.collect_column("v")[0].as_py() + 3.0 """ filter_raw = filter.expr if filter is not None else None return Expr(f.covar_pop(value_y.expr, value_x.expr, filter=filter_raw)) -def covar_samp(value_y: Expr, value_x: Expr, filter: Optional[Expr] = None) -> Expr: +def covar_samp(value_y: Expr, value_x: Expr, filter: Expr | None = None) -> Expr: """Computes the sample covariance. This aggregate function expects both values to be numeric and will return a float. @@ -1885,20 +3475,29 @@ def covar_samp(value_y: Expr, value_x: Expr, filter: Optional[Expr] = None) -> E value_y: The dependent variable for covariance value_x: The independent variable for covariance filter: If provided, only compute against rows for which the filter is True + + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0], "b": [4.0, 5.0, 6.0]}) + >>> result = df.aggregate( + ... [], [dfn.functions.covar_samp(dfn.col("a"), dfn.col("b")).alias("v")]) + >>> result.collect_column("v")[0].as_py() + 1.0 """ filter_raw = filter.expr if filter is not None else None return Expr(f.covar_samp(value_y.expr, value_x.expr, filter=filter_raw)) -def covar(value_y: Expr, value_x: Expr, filter: Optional[Expr] = None) -> Expr: +def covar(value_y: Expr, value_x: Expr, filter: Expr | None = None) -> Expr: """Computes the sample covariance. - This is an alias for :py:func:`covar_samp`. + See Also: + This is an alias for :py:func:`covar_samp`. """ return covar_samp(value_y, value_x, filter) -def max(expression: Expr, filter: Optional[Expr] = None) -> Expr: +def max(expression: Expr, filter: Expr | None = None) -> Expr: """Aggregate function that returns the maximum value of the argument. 
If using the builder functions described in ref:`_aggregation` this function ignores @@ -1907,21 +3506,37 @@ def max(expression: Expr, filter: Optional[Expr] = None) -> Expr: Args: expression: The value to find the maximum of filter: If provided, only compute against rows for which the filter is True + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1, 2, 3]}) + >>> result = df.aggregate([], [dfn.functions.max(dfn.col("a")).alias("v")]) + >>> result.collect_column("v")[0].as_py() + 3 """ filter_raw = filter.expr if filter is not None else None return Expr(f.max(expression.expr, filter=filter_raw)) -def mean(expression: Expr, filter: Optional[Expr] = None) -> Expr: +def mean(expression: Expr, filter: Expr | None = None) -> Expr: """Returns the average (mean) value of the argument. This is an alias for :py:func:`avg`. + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0]}) + >>> result = df.aggregate([], [dfn.functions.mean(dfn.col("a")).alias("v")]) + >>> result.collect_column("v")[0].as_py() + 2.0 """ return avg(expression, filter) def median( - expression: Expr, distinct: bool = False, filter: Optional[Expr] = None + expression: Expr, distinct: bool = False, filter: Expr | None = None ) -> Expr: """Computes the median of a set of numbers. 
@@ -1935,13 +3550,21 @@ def median( expression: The value to compute the median of distinct: If True, a single entry for each distinct value will be in the result filter: If provided, only compute against rows for which the filter is True + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0]}) + >>> result = df.aggregate([], [dfn.functions.median(dfn.col("a")).alias("v")]) + >>> result.collect_column("v")[0].as_py() + 2.0 """ filter_raw = filter.expr if filter is not None else None return Expr(f.median(expression.expr, distinct=distinct, filter=filter_raw)) -def min(expression: Expr, filter: Optional[Expr] = None) -> Expr: - """Returns the minimum value of the argument. +def min(expression: Expr, filter: Expr | None = None) -> Expr: + """Aggregate function that returns the minimum value of the argument. If using the builder functions described in ref:`_aggregation` this function ignores the options ``order_by``, ``null_treatment``, and ``distinct``. @@ -1949,6 +3572,14 @@ def min(expression: Expr, filter: Optional[Expr] = None) -> Expr: Args: expression: The value to find the minimum of filter: If provided, only compute against rows for which the filter is True + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1, 2, 3]}) + >>> result = df.aggregate([], [dfn.functions.min(dfn.col("a")).alias("v")]) + >>> result.collect_column("v")[0].as_py() + 1 """ filter_raw = filter.expr if filter is not None else None return Expr(f.min(expression.expr, filter=filter_raw)) @@ -1956,7 +3587,7 @@ def min(expression: Expr, filter: Optional[Expr] = None) -> Expr: def sum( expression: Expr, - filter: Optional[Expr] = None, + filter: Expr | None = None, ) -> Expr: """Computes the sum of a set of numbers. 
@@ -1968,12 +3599,20 @@ def sum( Args: expression: Values to combine into an array filter: If provided, only compute against rows for which the filter is True + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1, 2, 3]}) + >>> result = df.aggregate([], [dfn.functions.sum(dfn.col("a")).alias("v")]) + >>> result.collect_column("v")[0].as_py() + 6 """ filter_raw = filter.expr if filter is not None else None return Expr(f.sum(expression.expr, filter=filter_raw)) -def stddev(expression: Expr, filter: Optional[Expr] = None) -> Expr: +def stddev(expression: Expr, filter: Expr | None = None) -> Expr: """Computes the standard deviation of the argument. If using the builder functions described in ref:`_aggregation` this function ignores @@ -1982,12 +3621,19 @@ def stddev(expression: Expr, filter: Optional[Expr] = None) -> Expr: Args: expression: The value to find the minimum of filter: If provided, only compute against rows for which the filter is True + + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [2.0, 4.0, 6.0]}) + >>> result = df.aggregate([], [dfn.functions.stddev(dfn.col("a")).alias("v")]) + >>> result.collect_column("v")[0].as_py() + 2.0 """ filter_raw = filter.expr if filter is not None else None return Expr(f.stddev(expression.expr, filter=filter_raw)) -def stddev_pop(expression: Expr, filter: Optional[Expr] = None) -> Expr: +def stddev_pop(expression: Expr, filter: Expr | None = None) -> Expr: """Computes the population standard deviation of the argument. If using the builder functions described in ref:`_aggregation` this function ignores @@ -1996,28 +3642,53 @@ def stddev_pop(expression: Expr, filter: Optional[Expr] = None) -> Expr: Args: expression: The value to find the minimum of filter: If provided, only compute against rows for which the filter is True + + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1.0, 3.0]}) + >>> result = df.aggregate( + ... 
[], [dfn.functions.stddev_pop(dfn.col("a")).alias("v")] + ... ) + >>> result.collect_column("v")[0].as_py() + 1.0 """ filter_raw = filter.expr if filter is not None else None return Expr(f.stddev_pop(expression.expr, filter=filter_raw)) -def stddev_samp(arg: Expr, filter: Optional[Expr] = None) -> Expr: +def stddev_samp(arg: Expr, filter: Expr | None = None) -> Expr: """Computes the sample standard deviation of the argument. This is an alias for :py:func:`stddev`. + + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [2.0, 4.0, 6.0]}) + >>> result = df.aggregate( + ... [], [dfn.functions.stddev_samp(dfn.col("a")).alias("v")] + ... ) + >>> result.collect_column("v")[0].as_py() + 2.0 """ return stddev(arg, filter=filter) -def var(expression: Expr, filter: Optional[Expr] = None) -> Expr: +def var(expression: Expr, filter: Expr | None = None) -> Expr: """Computes the sample variance of the argument. This is an alias for :py:func:`var_samp`. + + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0]}) + >>> result = df.aggregate([], [dfn.functions.var(dfn.col("a")).alias("v")]) + >>> result.collect_column("v")[0].as_py() + 1.0 """ return var_samp(expression, filter) -def var_pop(expression: Expr, filter: Optional[Expr] = None) -> Expr: +def var_pop(expression: Expr, filter: Expr | None = None) -> Expr: """Computes the population variance of the argument. 
If using the builder functions described in ref:`_aggregation` this function ignores @@ -2026,12 +3697,19 @@ def var_pop(expression: Expr, filter: Optional[Expr] = None) -> Expr: Args: expression: The variable to compute the variance for filter: If provided, only compute against rows for which the filter is True + + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [0.0, 2.0]}) + >>> result = df.aggregate([], [dfn.functions.var_pop(dfn.col("a")).alias("v")]) + >>> result.collect_column("v")[0].as_py() + 1.0 """ filter_raw = filter.expr if filter is not None else None return Expr(f.var_pop(expression.expr, filter=filter_raw)) -def var_samp(expression: Expr, filter: Optional[Expr] = None) -> Expr: +def var_samp(expression: Expr, filter: Expr | None = None) -> Expr: """Computes the sample variance of the argument. If using the builder functions described in ref:`_aggregation` this function ignores @@ -2040,15 +3718,31 @@ def var_samp(expression: Expr, filter: Optional[Expr] = None) -> Expr: Args: expression: The variable to compute the variance for filter: If provided, only compute against rows for which the filter is True + + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0]}) + >>> result = df.aggregate([], [dfn.functions.var_samp(dfn.col("a")).alias("v")]) + >>> result.collect_column("v")[0].as_py() + 1.0 """ filter_raw = filter.expr if filter is not None else None return Expr(f.var_sample(expression.expr, filter=filter_raw)) -def var_sample(expression: Expr, filter: Optional[Expr] = None) -> Expr: +def var_sample(expression: Expr, filter: Expr | None = None) -> Expr: """Computes the sample variance of the argument. This is an alias for :py:func:`var_samp`. + + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0]}) + >>> result = df.aggregate( + ... [], [dfn.functions.var_sample(dfn.col("a")).alias("v")] + ... 
) + >>> result.collect_column("v")[0].as_py() + 1.0 """ return var_samp(expression, filter) @@ -2056,7 +3750,7 @@ def var_sample(expression: Expr, filter: Optional[Expr] = None) -> Expr: def regr_avgx( y: Expr, x: Expr, - filter: Optional[Expr] = None, + filter: Expr | None = None, ) -> Expr: """Computes the average of the independent variable ``x``. @@ -2070,6 +3764,14 @@ def regr_avgx( y: The linear regression dependent variable x: The linear regression independent variable filter: If provided, only compute against rows for which the filter is True + + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"y": [1.0, 2.0, 3.0], "x": [4.0, 5.0, 6.0]}) + >>> result = df.aggregate( + ... [], [dfn.functions.regr_avgx(dfn.col("y"), dfn.col("x")).alias("v")]) + >>> result.collect_column("v")[0].as_py() + 5.0 """ filter_raw = filter.expr if filter is not None else None @@ -2079,7 +3781,7 @@ def regr_avgx( def regr_avgy( y: Expr, x: Expr, - filter: Optional[Expr] = None, + filter: Expr | None = None, ) -> Expr: """Computes the average of the dependent variable ``y``. @@ -2093,6 +3795,14 @@ def regr_avgy( y: The linear regression dependent variable x: The linear regression independent variable filter: If provided, only compute against rows for which the filter is True + + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"y": [1.0, 2.0, 3.0], "x": [4.0, 5.0, 6.0]}) + >>> result = df.aggregate( + ... [], [dfn.functions.regr_avgy(dfn.col("y"), dfn.col("x")).alias("v")]) + >>> result.collect_column("v")[0].as_py() + 2.0 """ filter_raw = filter.expr if filter is not None else None @@ -2102,7 +3812,7 @@ def regr_avgy( def regr_count( y: Expr, x: Expr, - filter: Optional[Expr] = None, + filter: Expr | None = None, ) -> Expr: """Counts the number of rows in which both expressions are not null. 
@@ -2116,6 +3826,14 @@ def regr_count( y: The linear regression dependent variable x: The linear regression independent variable filter: If provided, only compute against rows for which the filter is True + + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"y": [1.0, 2.0, 3.0], "x": [4.0, 5.0, 6.0]}) + >>> result = df.aggregate( + ... [], [dfn.functions.regr_count(dfn.col("y"), dfn.col("x")).alias("v")]) + >>> result.collect_column("v")[0].as_py() + 3 """ filter_raw = filter.expr if filter is not None else None @@ -2125,7 +3843,7 @@ def regr_count( def regr_intercept( y: Expr, x: Expr, - filter: Optional[Expr] = None, + filter: Expr | None = None, ) -> Expr: """Computes the intercept from the linear regression. @@ -2139,6 +3857,15 @@ def regr_intercept( y: The linear regression dependent variable x: The linear regression independent variable filter: If provided, only compute against rows for which the filter is True + + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"y": [2.0, 4.0, 6.0], "x": [1.0, 2.0, 3.0]}) + >>> result = df.aggregate( + ... [], + ... [dfn.functions.regr_intercept(dfn.col("y"), dfn.col("x")).alias("v")]) + >>> result.collect_column("v")[0].as_py() + 0.0 """ filter_raw = filter.expr if filter is not None else None @@ -2148,7 +3875,7 @@ def regr_intercept( def regr_r2( y: Expr, x: Expr, - filter: Optional[Expr] = None, + filter: Expr | None = None, ) -> Expr: """Computes the R-squared value from linear regression. @@ -2162,6 +3889,14 @@ def regr_r2( y: The linear regression dependent variable x: The linear regression independent variable filter: If provided, only compute against rows for which the filter is True + + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"y": [2.0, 4.0, 6.0], "x": [1.0, 2.0, 3.0]}) + >>> result = df.aggregate( + ... 
[], [dfn.functions.regr_r2(dfn.col("y"), dfn.col("x")).alias("v")]) + >>> result.collect_column("v")[0].as_py() + 1.0 """ filter_raw = filter.expr if filter is not None else None @@ -2171,7 +3906,7 @@ def regr_r2( def regr_slope( y: Expr, x: Expr, - filter: Optional[Expr] = None, + filter: Expr | None = None, ) -> Expr: """Computes the slope from linear regression. @@ -2185,6 +3920,14 @@ def regr_slope( y: The linear regression dependent variable x: The linear regression independent variable filter: If provided, only compute against rows for which the filter is True + + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"y": [2.0, 4.0, 6.0], "x": [1.0, 2.0, 3.0]}) + >>> result = df.aggregate( + ... [], [dfn.functions.regr_slope(dfn.col("y"), dfn.col("x")).alias("v")]) + >>> result.collect_column("v")[0].as_py() + 2.0 """ filter_raw = filter.expr if filter is not None else None @@ -2194,7 +3937,7 @@ def regr_slope( def regr_sxx( y: Expr, x: Expr, - filter: Optional[Expr] = None, + filter: Expr | None = None, ) -> Expr: """Computes the sum of squares of the independent variable ``x``. @@ -2208,6 +3951,14 @@ def regr_sxx( y: The linear regression dependent variable x: The linear regression independent variable filter: If provided, only compute against rows for which the filter is True + + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"y": [1.0, 2.0, 3.0], "x": [1.0, 2.0, 3.0]}) + >>> result = df.aggregate( + ... [], [dfn.functions.regr_sxx(dfn.col("y"), dfn.col("x")).alias("v")]) + >>> result.collect_column("v")[0].as_py() + 2.0 """ filter_raw = filter.expr if filter is not None else None @@ -2217,7 +3968,7 @@ def regr_sxx( def regr_sxy( y: Expr, x: Expr, - filter: Optional[Expr] = None, + filter: Expr | None = None, ) -> Expr: """Computes the sum of products of pairs of numbers. 
@@ -2231,6 +3982,14 @@ def regr_sxy( y: The linear regression dependent variable x: The linear regression independent variable filter: If provided, only compute against rows for which the filter is True + + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"y": [1.0, 2.0, 3.0], "x": [1.0, 2.0, 3.0]}) + >>> result = df.aggregate( + ... [], [dfn.functions.regr_sxy(dfn.col("y"), dfn.col("x")).alias("v")]) + >>> result.collect_column("v")[0].as_py() + 2.0 """ filter_raw = filter.expr if filter is not None else None @@ -2240,7 +3999,7 @@ def regr_sxy( def regr_syy( y: Expr, x: Expr, - filter: Optional[Expr] = None, + filter: Expr | None = None, ) -> Expr: """Computes the sum of squares of the dependent variable ``y``. @@ -2254,6 +4013,14 @@ def regr_syy( y: The linear regression dependent variable x: The linear regression independent variable filter: If provided, only compute against rows for which the filter is True + + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"y": [1.0, 2.0, 3.0], "x": [1.0, 2.0, 3.0]}) + >>> result = df.aggregate( + ... [], [dfn.functions.regr_syy(dfn.col("y"), dfn.col("x")).alias("v")]) + >>> result.collect_column("v")[0].as_py() + 2.0 """ filter_raw = filter.expr if filter is not None else None @@ -2262,8 +4029,8 @@ def regr_syy( def first_value( expression: Expr, - filter: Optional[Expr] = None, - order_by: Optional[list[SortKey] | SortKey] = None, + filter: Expr | None = None, + order_by: list[SortKey] | SortKey | None = None, null_treatment: NullTreatment = NullTreatment.RESPECT_NULLS, ) -> Expr: """Returns the first value in a group of values. 
@@ -2282,7 +4049,7 @@ def first_value( For example:: - df.select(first_value(col("a"), order_by="ts")) + df.aggregate([], first_value(col("a"), order_by="ts")) """ order_by_raw = sort_list_to_raw_sort_list(order_by) filter_raw = filter.expr if filter is not None else None @@ -2299,8 +4066,8 @@ def first_value( def last_value( expression: Expr, - filter: Optional[Expr] = None, - order_by: Optional[list[SortKey] | SortKey] = None, + filter: Expr | None = None, + order_by: list[SortKey] | SortKey | None = None, null_treatment: NullTreatment = NullTreatment.RESPECT_NULLS, ) -> Expr: """Returns the last value in a group of values. @@ -2319,7 +4086,7 @@ def last_value( For example:: - df.select(last_value(col("a"), order_by="ts")) + df.aggregate([], last_value(col("a"), order_by="ts")) """ order_by_raw = sort_list_to_raw_sort_list(order_by) filter_raw = filter.expr if filter is not None else None @@ -2337,8 +4104,8 @@ def last_value( def nth_value( expression: Expr, n: int, - filter: Optional[Expr] = None, - order_by: Optional[list[SortKey] | SortKey] = None, + filter: Expr | None = None, + order_by: list[SortKey] | SortKey | None = None, null_treatment: NullTreatment = NullTreatment.RESPECT_NULLS, ) -> Expr: """Returns the n-th value in a group of values. @@ -2358,7 +4125,7 @@ def nth_value( For example:: - df.select(nth_value(col("a"), 2, order_by="ts")) + df.aggregate([], nth_value(col("a"), 2, order_by="ts")) """ order_by_raw = sort_list_to_raw_sort_list(order_by) filter_raw = filter.expr if filter is not None else None @@ -2374,7 +4141,7 @@ def nth_value( ) -def bit_and(expression: Expr, filter: Optional[Expr] = None) -> Expr: +def bit_and(expression: Expr, filter: Expr | None = None) -> Expr: """Computes the bitwise AND of the argument. This aggregate function will bitwise compare every value in the input partition. 
@@ -2385,12 +4152,20 @@ def bit_and(expression: Expr, filter: Optional[Expr] = None) -> Expr: Args: expression: Argument to perform bitwise calculation on filter: If provided, only compute against rows for which the filter is True + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [7, 3]}) + >>> result = df.aggregate([], [dfn.functions.bit_and(dfn.col("a")).alias("v")]) + >>> result.collect_column("v")[0].as_py() + 3 """ filter_raw = filter.expr if filter is not None else None return Expr(f.bit_and(expression.expr, filter=filter_raw)) -def bit_or(expression: Expr, filter: Optional[Expr] = None) -> Expr: +def bit_or(expression: Expr, filter: Expr | None = None) -> Expr: """Computes the bitwise OR of the argument. This aggregate function will bitwise compare every value in the input partition. @@ -2401,13 +4176,21 @@ def bit_or(expression: Expr, filter: Optional[Expr] = None) -> Expr: Args: expression: Argument to perform bitwise calculation on filter: If provided, only compute against rows for which the filter is True + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1, 2]}) + >>> result = df.aggregate([], [dfn.functions.bit_or(dfn.col("a")).alias("v")]) + >>> result.collect_column("v")[0].as_py() + 3 """ filter_raw = filter.expr if filter is not None else None return Expr(f.bit_or(expression.expr, filter=filter_raw)) def bit_xor( - expression: Expr, distinct: bool = False, filter: Optional[Expr] = None + expression: Expr, distinct: bool = False, filter: Expr | None = None ) -> Expr: """Computes the bitwise XOR of the argument. 
@@ -2420,12 +4203,20 @@ def bit_xor( expression: Argument to perform bitwise calculation on distinct: If True, evaluate each unique value of expression only once filter: If provided, only compute against rows for which the filter is True + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [5, 3]}) + >>> result = df.aggregate([], [dfn.functions.bit_xor(dfn.col("a")).alias("v")]) + >>> result.collect_column("v")[0].as_py() + 6 """ filter_raw = filter.expr if filter is not None else None return Expr(f.bit_xor(expression.expr, distinct=distinct, filter=filter_raw)) -def bool_and(expression: Expr, filter: Optional[Expr] = None) -> Expr: +def bool_and(expression: Expr, filter: Expr | None = None) -> Expr: """Computes the boolean AND of the argument. This aggregate function will compare every value in the input partition. These are @@ -2437,12 +4228,20 @@ def bool_and(expression: Expr, filter: Optional[Expr] = None) -> Expr: Args: expression: Argument to perform calculation on filter: If provided, only compute against rows for which the filter is True + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [True, True, False]}) + >>> result = df.aggregate([], [dfn.functions.bool_and(dfn.col("a")).alias("v")]) + >>> result.collect_column("v")[0].as_py() + False """ filter_raw = filter.expr if filter is not None else None return Expr(f.bool_and(expression.expr, filter=filter_raw)) -def bool_or(expression: Expr, filter: Optional[Expr] = None) -> Expr: +def bool_or(expression: Expr, filter: Expr | None = None) -> Expr: """Computes the boolean OR of the argument. This aggregate function will compare every value in the input partition. 
These are @@ -2454,6 +4253,14 @@ def bool_or(expression: Expr, filter: Optional[Expr] = None) -> Expr: Args: expression: Argument to perform calculation on filter: If provided, only compute against rows for which the filter is True + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [False, False, True]}) + >>> result = df.aggregate([], [dfn.functions.bool_or(dfn.col("a")).alias("v")]) + >>> result.collect_column("v")[0].as_py() + True """ filter_raw = filter.expr if filter is not None else None return Expr(f.bool_or(expression.expr, filter=filter_raw)) @@ -2462,9 +4269,9 @@ def bool_or(expression: Expr, filter: Optional[Expr] = None) -> Expr: def lead( arg: Expr, shift_offset: int = 1, - default_value: Optional[Any] = None, - partition_by: Optional[list[Expr] | Expr] = None, - order_by: Optional[list[SortKey] | SortKey] = None, + default_value: Any | None = None, + partition_by: list[Expr] | Expr | None = None, + order_by: list[SortKey] | SortKey | None = None, ) -> Expr: """Create a lead window function. @@ -2520,9 +4327,9 @@ def lead( def lag( arg: Expr, shift_offset: int = 1, - default_value: Optional[Any] = None, - partition_by: Optional[list[Expr] | Expr] = None, - order_by: Optional[list[SortKey] | SortKey] = None, + default_value: Any | None = None, + partition_by: list[Expr] | Expr | None = None, + order_by: list[SortKey] | SortKey | None = None, ) -> Expr: """Create a lag window function. @@ -2573,8 +4380,8 @@ def lag( def row_number( - partition_by: Optional[list[Expr] | Expr] = None, - order_by: Optional[list[SortKey] | SortKey] = None, + partition_by: list[Expr] | Expr | None = None, + order_by: list[SortKey] | SortKey | None = None, ) -> Expr: """Create a row number window function. 
@@ -2612,8 +4419,8 @@ def row_number( def rank( - partition_by: Optional[list[Expr] | Expr] = None, - order_by: Optional[list[SortKey] | SortKey] = None, + partition_by: list[Expr] | Expr | None = None, + order_by: list[SortKey] | SortKey | None = None, ) -> Expr: """Create a rank window function. @@ -2656,8 +4463,8 @@ def rank( def dense_rank( - partition_by: Optional[list[Expr] | Expr] = None, - order_by: Optional[list[SortKey] | SortKey] = None, + partition_by: list[Expr] | Expr | None = None, + order_by: list[SortKey] | SortKey | None = None, ) -> Expr: """Create a dense_rank window function. @@ -2695,8 +4502,8 @@ def dense_rank( def percent_rank( - partition_by: Optional[list[Expr] | Expr] = None, - order_by: Optional[list[SortKey] | SortKey] = None, + partition_by: list[Expr] | Expr | None = None, + order_by: list[SortKey] | SortKey | None = None, ) -> Expr: """Create a percent_rank window function. @@ -2735,8 +4542,8 @@ def percent_rank( def cume_dist( - partition_by: Optional[list[Expr] | Expr] = None, - order_by: Optional[list[SortKey] | SortKey] = None, + partition_by: list[Expr] | Expr | None = None, + order_by: list[SortKey] | SortKey | None = None, ) -> Expr: """Create a cumulative distribution window function. @@ -2776,8 +4583,8 @@ def cume_dist( def ntile( groups: int, - partition_by: Optional[list[Expr] | Expr] = None, - order_by: Optional[list[SortKey] | SortKey] = None, + partition_by: list[Expr] | Expr | None = None, + order_by: list[SortKey] | SortKey | None = None, ) -> Expr: """Create a n-tile window function. @@ -2822,8 +4629,8 @@ def ntile( def string_agg( expression: Expr, delimiter: str, - filter: Optional[Expr] = None, - order_by: Optional[list[SortKey] | SortKey] = None, + filter: Expr | None = None, + order_by: list[SortKey] | SortKey | None = None, ) -> Expr: """Concatenates the input strings. 
@@ -2843,7 +4650,16 @@ def string_agg( For example:: - df.select(string_agg(col("a"), ",", order_by="b")) + df.aggregate([], string_agg(col("a"), ",", order_by="b")) + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["x", "y", "z"]}) + >>> result = df.aggregate( + ... [], [dfn.functions.string_agg(dfn.col("a"), ",", order_by="a").alias("s")]) + >>> result.collect_column("s")[0].as_py() + 'x,y,z' """ order_by_raw = sort_list_to_raw_sort_list(order_by) filter_raw = filter.expr if filter is not None else None diff --git a/python/datafusion/input/location.py b/python/datafusion/input/location.py index 08d98d115..b804ac18b 100644 --- a/python/datafusion/input/location.py +++ b/python/datafusion/input/location.py @@ -17,7 +17,6 @@ """The default input source for DataFusion.""" -import glob from pathlib import Path from typing import Any @@ -84,6 +83,7 @@ def build_table( raise RuntimeError(msg) # Input could possibly be multiple files. Create a list if so - input_files = glob.glob(input_item) + input_path = Path(input_item) + input_files = [str(p) for p in input_path.parent.glob(input_path.name)] return SqlTable(table_name, columns, num_rows, input_files) diff --git a/python/datafusion/io.py b/python/datafusion/io.py index 67dbc730f..4f9c3c516 100644 --- a/python/datafusion/io.py +++ b/python/datafusion/io.py @@ -31,6 +31,8 @@ from datafusion.dataframe import DataFrame from datafusion.expr import Expr + from .options import CsvReadOptions + def read_parquet( path: str | pathlib.Path, @@ -126,6 +128,7 @@ def read_csv( file_extension: str = ".csv", table_partition_cols: list[tuple[str, str | pa.DataType]] | None = None, file_compression_type: str | None = None, + options: CsvReadOptions | None = None, ) -> DataFrame: """Read a CSV data source. @@ -147,15 +150,12 @@ def read_csv( selected for data input. table_partition_cols: Partition columns. file_compression_type: File compression type. 
+ options: Set advanced options for CSV reading. This cannot be + combined with any of the other options in this method. Returns: DataFrame representation of the read CSV files """ - if table_partition_cols is None: - table_partition_cols = [] - - path = [str(p) for p in path] if isinstance(path, list) else str(path) - return SessionContext.global_ctx().read_csv( path, schema, @@ -165,6 +165,7 @@ def read_csv( file_extension, table_partition_cols, file_compression_type, + options, ) diff --git a/python/datafusion/options.py b/python/datafusion/options.py new file mode 100644 index 000000000..ec19f37d0 --- /dev/null +++ b/python/datafusion/options.py @@ -0,0 +1,284 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Options for reading various file formats.""" + +from __future__ import annotations + +import warnings +from typing import TYPE_CHECKING + +import pyarrow as pa + +from datafusion.expr import sort_list_to_raw_sort_list + +if TYPE_CHECKING: + from datafusion.expr import SortExpr + +from ._internal import options + +__all__ = ["CsvReadOptions"] + +DEFAULT_MAX_INFER_SCHEMA = 1000 + + +class CsvReadOptions: + """Options for reading CSV files. + + This class provides a builder pattern for configuring CSV reading options. 
+ All methods starting with ``with_`` return ``self`` to allow method chaining. + """ + + def __init__( + self, + *, + has_header: bool = True, + delimiter: str = ",", + quote: str = '"', + terminator: str | None = None, + escape: str | None = None, + comment: str | None = None, + newlines_in_values: bool = False, + schema: pa.Schema | None = None, + schema_infer_max_records: int = DEFAULT_MAX_INFER_SCHEMA, + file_extension: str = ".csv", + table_partition_cols: list[tuple[str, pa.DataType]] | None = None, + file_compression_type: str = "", + file_sort_order: list[list[SortExpr]] | None = None, + null_regex: str | None = None, + truncated_rows: bool = False, + ) -> None: + """Initialize CsvReadOptions. + + Args: + has_header: Does the CSV file have a header row? If schema inference + is run on a file with no headers, default column names are created. + delimiter: Column delimiter character. Must be a single ASCII character. + quote: Quote character for fields containing delimiters or newlines. + Must be a single ASCII character. + terminator: Optional line terminator character. If ``None``, uses CRLF. + Must be a single ASCII character. + escape: Optional escape character for quotes. Must be a single ASCII + character. + comment: If specified, lines beginning with this character are ignored. + Must be a single ASCII character. + newlines_in_values: Whether newlines in quoted values are supported. + Parsing newlines in quoted values may be affected by execution + behavior such as parallel file scanning. Setting this to ``True`` + ensures that newlines in values are parsed successfully, which may + reduce performance. + schema: Optional PyArrow schema representing the CSV files. If ``None``, + the CSV reader will try to infer it based on data in the file. + schema_infer_max_records: Maximum number of rows to read from CSV files + for schema inference if needed. + file_extension: File extension; only files with this extension are + selected for data input. 
+ table_partition_cols: Partition columns as a list of tuples of + (column_name, data_type). + file_compression_type: File compression type. Supported values are + ``"gzip"``, ``"bz2"``, ``"xz"``, ``"zstd"``, or empty string for + uncompressed. + file_sort_order: Optional sort order of the files as a list of sort + expressions per file. + null_regex: Optional regex pattern to match null values in the CSV. + truncated_rows: Whether to allow truncated rows when parsing. By default + this is ``False`` and will error if the CSV rows have different + lengths. When set to ``True``, it will allow records with less than + the expected number of columns and fill the missing columns with + nulls. If the record's schema is not nullable, it will still return + an error. + """ + validate_single_character("delimiter", delimiter) + validate_single_character("quote", quote) + validate_single_character("terminator", terminator) + validate_single_character("escape", escape) + validate_single_character("comment", comment) + + self.has_header = has_header + self.delimiter = delimiter + self.quote = quote + self.terminator = terminator + self.escape = escape + self.comment = comment + self.newlines_in_values = newlines_in_values + self.schema = schema + self.schema_infer_max_records = schema_infer_max_records + self.file_extension = file_extension + self.table_partition_cols = table_partition_cols or [] + self.file_compression_type = file_compression_type + self.file_sort_order = file_sort_order or [] + self.null_regex = null_regex + self.truncated_rows = truncated_rows + + def with_has_header(self, has_header: bool) -> CsvReadOptions: + """Configure whether the CSV has a header row.""" + self.has_header = has_header + return self + + def with_delimiter(self, delimiter: str) -> CsvReadOptions: + """Configure the column delimiter.""" + self.delimiter = delimiter + return self + + def with_quote(self, quote: str) -> CsvReadOptions: + """Configure the quote character.""" + self.quote = 
quote + return self + + def with_terminator(self, terminator: str | None) -> CsvReadOptions: + """Configure the line terminator character.""" + self.terminator = terminator + return self + + def with_escape(self, escape: str | None) -> CsvReadOptions: + """Configure the escape character.""" + self.escape = escape + return self + + def with_comment(self, comment: str | None) -> CsvReadOptions: + """Configure the comment character.""" + self.comment = comment + return self + + def with_newlines_in_values(self, newlines_in_values: bool) -> CsvReadOptions: + """Configure whether newlines in values are supported.""" + self.newlines_in_values = newlines_in_values + return self + + def with_schema(self, schema: pa.Schema | None) -> CsvReadOptions: + """Configure the schema.""" + self.schema = schema + return self + + def with_schema_infer_max_records( + self, schema_infer_max_records: int + ) -> CsvReadOptions: + """Configure maximum records for schema inference.""" + self.schema_infer_max_records = schema_infer_max_records + return self + + def with_file_extension(self, file_extension: str) -> CsvReadOptions: + """Configure the file extension filter.""" + self.file_extension = file_extension + return self + + def with_table_partition_cols( + self, table_partition_cols: list[tuple[str, pa.DataType]] + ) -> CsvReadOptions: + """Configure table partition columns.""" + self.table_partition_cols = table_partition_cols + return self + + def with_file_compression_type(self, file_compression_type: str) -> CsvReadOptions: + """Configure file compression type.""" + self.file_compression_type = file_compression_type + return self + + def with_file_sort_order( + self, file_sort_order: list[list[SortExpr]] + ) -> CsvReadOptions: + """Configure file sort order.""" + self.file_sort_order = file_sort_order + return self + + def with_null_regex(self, null_regex: str | None) -> CsvReadOptions: + """Configure null value regex pattern.""" + self.null_regex = null_regex + return self + + def 
with_truncated_rows(self, truncated_rows: bool) -> CsvReadOptions: + """Configure whether to allow truncated rows.""" + self.truncated_rows = truncated_rows + return self + + def to_inner(self) -> options.CsvReadOptions: + """Convert this object into the underlying Rust structure. + + This is intended for internal use only. + """ + file_sort_order = ( + [] + if self.file_sort_order is None + else [ + sort_list_to_raw_sort_list(sort_list) + for sort_list in self.file_sort_order + ] + ) + + return options.CsvReadOptions( + has_header=self.has_header, + delimiter=ord(self.delimiter[0]) if self.delimiter else ord(","), + quote=ord(self.quote[0]) if self.quote else ord('"'), + terminator=ord(self.terminator[0]) if self.terminator else None, + escape=ord(self.escape[0]) if self.escape else None, + comment=ord(self.comment[0]) if self.comment else None, + newlines_in_values=self.newlines_in_values, + schema=self.schema, + schema_infer_max_records=self.schema_infer_max_records, + file_extension=self.file_extension, + table_partition_cols=_convert_table_partition_cols( + self.table_partition_cols + ), + file_compression_type=self.file_compression_type or "", + file_sort_order=file_sort_order, + null_regex=self.null_regex, + truncated_rows=self.truncated_rows, + ) + + +def validate_single_character(name: str, value: str | None) -> None: + if value is not None and len(value) != 1: + message = f"{name} must be a single character" + raise ValueError(message) + + +def _convert_table_partition_cols( + table_partition_cols: list[tuple[str, str | pa.DataType]], +) -> list[tuple[str, pa.DataType]]: + warn = False + converted_table_partition_cols = [] + + for col, data_type in table_partition_cols: + if isinstance(data_type, str): + warn = True + if data_type == "string": + converted_data_type = pa.string() + elif data_type == "int": + converted_data_type = pa.int32() + else: + message = ( + f"Unsupported literal data type '{data_type}' for partition " + "column. 
Supported types are 'string' and 'int'" + ) + raise ValueError(message) + else: + converted_data_type = data_type + + converted_table_partition_cols.append((col, converted_data_type)) + + if warn: + message = ( + "using literals for table_partition_cols data types is deprecated," + "use pyarrow types instead" + ) + warnings.warn( + message, + category=DeprecationWarning, + stacklevel=2, + ) + + return converted_table_partition_cols diff --git a/python/datafusion/plan.py b/python/datafusion/plan.py index 0b7bebcb3..fb54fd624 100644 --- a/python/datafusion/plan.py +++ b/python/datafusion/plan.py @@ -98,6 +98,12 @@ def to_proto(self) -> bytes: """ return self._raw_plan.to_proto() + def __eq__(self, other: LogicalPlan) -> bool: + """Test equality.""" + if not isinstance(other, LogicalPlan): + return False + return self._raw_plan.__eq__(other._raw_plan) + class ExecutionPlan: """Represent nodes in the DataFusion Physical Plan.""" diff --git a/python/datafusion/record_batch.py b/python/datafusion/record_batch.py index 556eaa786..c24cde0ac 100644 --- a/python/datafusion/record_batch.py +++ b/python/datafusion/record_batch.py @@ -46,6 +46,26 @@ def to_pyarrow(self) -> pa.RecordBatch: """Convert to :py:class:`pa.RecordBatch`.""" return self.record_batch.to_pyarrow() + def __arrow_c_array__( + self, requested_schema: object | None = None + ) -> tuple[object, object]: + """Export the record batch via the Arrow C Data Interface. + + This allows zero-copy interchange with libraries that support the + `Arrow PyCapsule interface <https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html>`_. + + Args: + requested_schema: Attempt to provide the record batch using this + schema. Only straightforward projections such as column + selection or reordering are applied. + + Returns: + Two Arrow PyCapsule objects representing the ``ArrowArray`` and + ``ArrowSchema``. + """ + return self.record_batch.__arrow_c_array__(requested_schema) + class RecordBatchStream: """This class represents a stream of record batches.
@@ -63,19 +83,19 @@ def next(self) -> RecordBatch: return next(self) async def __anext__(self) -> RecordBatch: - """Async iterator function.""" + """Return the next :py:class:`RecordBatch` in the stream asynchronously.""" next_batch = await self.rbs.__anext__() return RecordBatch(next_batch) def __next__(self) -> RecordBatch: - """Iterator function.""" + """Return the next :py:class:`RecordBatch` in the stream.""" next_batch = next(self.rbs) return RecordBatch(next_batch) def __aiter__(self) -> typing_extensions.Self: - """Async iterator function.""" + """Return an asynchronous iterator over record batches.""" return self def __iter__(self) -> typing_extensions.Self: - """Iterator function.""" + """Return an iterator over record batches.""" return self diff --git a/python/datafusion/substrait.py b/python/datafusion/substrait.py index f10adfb0c..3115238fa 100644 --- a/python/datafusion/substrait.py +++ b/python/datafusion/substrait.py @@ -67,6 +67,26 @@ def encode(self) -> bytes: """ return self.plan_internal.encode() + def to_json(self) -> str: + """Get the JSON representation of the Substrait plan. + + Returns: + A JSON representation of the Substrait plan. + """ + return self.plan_internal.to_json() + + @staticmethod + def from_json(json: str) -> Plan: + """Parse a plan from a JSON string representation. + + Args: + json: JSON representation of a Substrait plan. + + Returns: + Plan object representing the Substrait plan. 
+ """ + return Plan(substrait_internal.Plan.from_json(json)) + @deprecated("Use `Plan` instead.") class plan(Plan): # noqa: N801 diff --git a/python/datafusion/user_defined.py b/python/datafusion/user_defined.py index 67568e313..eef23e741 100644 --- a/python/datafusion/user_defined.py +++ b/python/datafusion/user_defined.py @@ -22,15 +22,19 @@ import functools from abc import ABCMeta, abstractmethod from enum import Enum -from typing import TYPE_CHECKING, Any, Callable, Optional, Protocol, TypeVar, overload +from typing import TYPE_CHECKING, Any, Protocol, TypeGuard, TypeVar, cast, overload import pyarrow as pa import datafusion._internal as df_internal +from datafusion import SessionContext from datafusion.expr import Expr if TYPE_CHECKING: + from _typeshed import CapsuleType as _PyCapsule + _R = TypeVar("_R", bound=pa.DataType) + from collections.abc import Callable, Sequence class Volatility(Enum): @@ -77,12 +81,38 @@ def __str__(self) -> str: return self.name.lower() +def data_type_or_field_to_field(value: pa.DataType | pa.Field, name: str) -> pa.Field: + """Helper function to return a Field from either a Field or DataType.""" + if isinstance(value, pa.Field): + return value + return pa.field(name, type=value) + + +def data_types_or_fields_to_field_list( + inputs: Sequence[pa.Field | pa.DataType] | pa.Field | pa.DataType, +) -> list[pa.Field]: + """Helper function to return a list of Fields.""" + if isinstance(inputs, pa.DataType): + return [pa.field("value", type=inputs)] + if isinstance(inputs, pa.Field): + return [inputs] + + return [ + data_type_or_field_to_field(v, f"value_{idx}") for (idx, v) in enumerate(inputs) + ] + + class ScalarUDFExportable(Protocol): """Type hint for object that has __datafusion_scalar_udf__ PyCapsule.""" def __datafusion_scalar_udf__(self) -> object: ... 
# noqa: D105 +def _is_pycapsule(value: object) -> TypeGuard[_PyCapsule]: + """Return ``True`` when ``value`` is a CPython ``PyCapsule``.""" + return value.__class__.__name__ == "PyCapsule" + + class ScalarUDF: """Class for performing scalar user-defined functions (UDF). @@ -94,8 +124,8 @@ def __init__( self, name: str, func: Callable[..., _R], - input_types: pa.DataType | list[pa.DataType], - return_type: _R, + input_fields: list[pa.Field], + return_field: _R, volatility: Volatility | str, ) -> None: """Instantiate a scalar user-defined function (UDF). @@ -105,10 +135,10 @@ def __init__( if hasattr(func, "__datafusion_scalar_udf__"): self._udf = df_internal.ScalarUDF.from_pycapsule(func) return - if isinstance(input_types, pa.DataType): - input_types = [input_types] + if isinstance(input_fields, pa.DataType): + input_fields = [input_fields] self._udf = df_internal.ScalarUDF( - name, func, input_types, return_type, str(volatility) + name, func, input_fields, return_field, str(volatility) ) def __repr__(self) -> str: @@ -127,20 +157,20 @@ def __call__(self, *args: Expr) -> Expr: @overload @staticmethod def udf( - input_types: list[pa.DataType], - return_type: _R, + input_fields: Sequence[pa.DataType | pa.Field] | pa.DataType | pa.Field, + return_field: pa.DataType | pa.Field, volatility: Volatility | str, - name: Optional[str] = None, + name: str | None = None, ) -> Callable[..., ScalarUDF]: ... @overload @staticmethod def udf( func: Callable[..., _R], - input_types: list[pa.DataType], - return_type: _R, + input_fields: Sequence[pa.DataType | pa.Field] | pa.DataType | pa.Field, + return_field: pa.DataType | pa.Field, volatility: Volatility | str, - name: Optional[str] = None, + name: str | None = None, ) -> ScalarUDF: ... @overload @@ -154,20 +184,24 @@ def udf(*args: Any, **kwargs: Any): # noqa: D417 This class can be used both as either a function or a decorator. Usage: - - As a function: ``udf(func, input_types, return_type, volatility, name)``. 
- - As a decorator: ``@udf(input_types, return_type, volatility, name)``. + - As a function: ``udf(func, input_fields, return_field, volatility, name)``. + - As a decorator: ``@udf(input_fields, return_field, volatility, name)``. When used a decorator, do **not** pass ``func`` explicitly. + In lieu of passing a PyArrow Field, you can pass a DataType for simplicity. + When you do so, it will be assumed that the nullability of the inputs and + output are True and that they have no metadata. + Args: func (Callable, optional): Only needed when calling as a function. Skip this argument when using `udf` as a decorator. If you have a Rust backed ScalarUDF within a PyCapsule, you can pass this parameter and ignore the rest. They will be determined directly from the underlying function. See the online documentation for more information. - input_types (list[pa.DataType]): The data types of the arguments - to ``func``. This list must be of the same length as the number of - arguments. - return_type (_R): The data type of the return value from the function. + input_fields (list[pa.Field | pa.DataType]): The data types or Fields + of the arguments to ``func``. This list must be of the same length + as the number of arguments. + return_field (_R): The field of the return value from the function. volatility (Volatility | str): See `Volatility` for allowed values. name (Optional[str]): A descriptive name for the function. 
@@ -187,14 +221,14 @@ def double_func(x): @udf([pa.int32()], pa.int32(), "volatile", "double_it") def double_udf(x): return x * 2 - """ + """ # noqa: W505 E501 def _function( func: Callable[..., _R], - input_types: list[pa.DataType], - return_type: _R, + input_fields: Sequence[pa.DataType | pa.Field] | pa.DataType | pa.Field, + return_field: pa.DataType | pa.Field, volatility: Volatility | str, - name: Optional[str] = None, + name: str | None = None, ) -> ScalarUDF: if not callable(func): msg = "`func` argument must be callable" @@ -204,27 +238,29 @@ def _function( name = func.__qualname__.lower() else: name = func.__class__.__name__.lower() + input_fields = data_types_or_fields_to_field_list(input_fields) + return_field = data_type_or_field_to_field(return_field, "value") return ScalarUDF( name=name, func=func, - input_types=input_types, - return_type=return_type, + input_fields=input_fields, + return_field=return_field, volatility=volatility, ) def _decorator( - input_types: list[pa.DataType], - return_type: _R, + input_fields: Sequence[pa.DataType | pa.Field] | pa.DataType | pa.Field, + return_field: _R, volatility: Volatility | str, - name: Optional[str] = None, + name: str | None = None, ) -> Callable: - def decorator(func: Callable): + def decorator(func: Callable) -> Callable: udf_caller = ScalarUDF.udf( - func, input_types, return_type, volatility, name + func, input_fields, return_field, volatility, name ) @functools.wraps(func) - def wrapper(*args: Any, **kwargs: Any): + def wrapper(*args: Any, **kwargs: Any) -> Callable: return udf_caller(*args, **kwargs) return wrapper @@ -251,8 +287,8 @@ def from_pycapsule(func: ScalarUDFExportable) -> ScalarUDF: return ScalarUDF( name=name, func=func, - input_types=None, - return_type=None, + input_fields=None, + return_field=None, volatility=None, ) @@ -262,7 +298,16 @@ class Accumulator(metaclass=ABCMeta): @abstractmethod def state(self) -> list[pa.Scalar]: - """Return the current state.""" + """Return the current 
state. + + While this function template expects PyArrow Scalar values return type, + you can return any value that can be converted into a Scalar. This + includes basic Python data types such as integers and strings. In + addition to primitive types, we currently support PyArrow, nanoarrow, + and arro3 objects in addition to primitive data types. Other objects + that support the Arrow FFI standard will be given a "best attempt" at + conversion to scalar objects. + """ @abstractmethod def update(self, *values: pa.Array) -> None: @@ -274,7 +319,16 @@ def merge(self, states: list[pa.Array]) -> None: @abstractmethod def evaluate(self) -> pa.Scalar: - """Return the resultant value.""" + """Return the resultant value. + + While this function template expects a PyArrow Scalar value return type, + you can return any value that can be converted into a Scalar. This + includes basic Python data types such as integers and strings. In + addition to primitive types, we currently support PyArrow, nanoarrow, + and arro3 objects in addition to primitive data types. Other objects + that support the Arrow FFI standard will be given a "best attempt" at + conversion to scalar objects. + """ class AggregateUDFExportable(Protocol): @@ -290,6 +344,7 @@ class AggregateUDF: also :py:class:`ScalarUDF` for operating on a row by row basis. """ + @overload def __init__( self, name: str, @@ -298,6 +353,27 @@ def __init__( return_type: pa.DataType, state_type: list[pa.DataType], volatility: Volatility | str, + ) -> None: ... + + @overload + def __init__( + self, + name: str, + accumulator: AggregateUDFExportable, + input_types: None = ..., + return_type: None = ..., + state_type: None = ..., + volatility: None = ..., + ) -> None: ... 
+ + def __init__( + self, + name: str, + accumulator: Callable[[], Accumulator] | AggregateUDFExportable, + input_types: list[pa.DataType] | None, + return_type: pa.DataType | None, + state_type: list[pa.DataType] | None, + volatility: Volatility | str | None, ) -> None: """Instantiate a user-defined aggregate function (UDAF). @@ -307,6 +383,18 @@ def __init__( if hasattr(accumulator, "__datafusion_aggregate_udf__"): self._udaf = df_internal.AggregateUDF.from_pycapsule(accumulator) return + if ( + input_types is None + or return_type is None + or state_type is None + or volatility is None + ): + msg = ( + "`input_types`, `return_type`, `state_type`, and `volatility` " + "must be provided when `accumulator` is callable." + ) + raise TypeError(msg) + self._udaf = df_internal.AggregateUDF( name, accumulator, @@ -336,7 +424,7 @@ def udaf( return_type: pa.DataType, state_type: list[pa.DataType], volatility: Volatility | str, - name: Optional[str] = None, + name: str | None = None, ) -> Callable[..., AggregateUDF]: ... @overload @@ -347,9 +435,17 @@ def udaf( return_type: pa.DataType, state_type: list[pa.DataType], volatility: Volatility | str, - name: Optional[str] = None, + name: str | None = None, ) -> AggregateUDF: ... + @overload + @staticmethod + def udaf(accum: AggregateUDFExportable) -> AggregateUDF: ... + + @overload + @staticmethod + def udaf(accum: _PyCapsule) -> AggregateUDF: ... + @staticmethod def udaf(*args: Any, **kwargs: Any): # noqa: D417, C901 """Create a new User-Defined Aggregate Function (UDAF). @@ -429,7 +525,7 @@ def _function( return_type: pa.DataType, state_type: list[pa.DataType], volatility: Volatility | str, - name: Optional[str] = None, + name: str | None = None, ) -> AggregateUDF: if not callable(accum): msg = "`func` must be callable." 
@@ -455,7 +551,7 @@ def _decorator( return_type: pa.DataType, state_type: list[pa.DataType], volatility: Volatility | str, - name: Optional[str] = None, + name: str | None = None, ) -> Callable[..., Callable[..., Expr]]: def decorator(accum: Callable[[], Accumulator]) -> Callable[..., Expr]: udaf_caller = AggregateUDF.udaf( @@ -470,7 +566,7 @@ def wrapper(*args: Any, **kwargs: Any) -> Expr: return decorator - if hasattr(args[0], "__datafusion_aggregate_udf__"): + if hasattr(args[0], "__datafusion_aggregate_udf__") or _is_pycapsule(args[0]): return AggregateUDF.from_pycapsule(args[0]) if args and callable(args[0]): @@ -480,16 +576,22 @@ def wrapper(*args: Any, **kwargs: Any) -> Expr: return _decorator(*args, **kwargs) @staticmethod - def from_pycapsule(func: AggregateUDFExportable) -> AggregateUDF: + def from_pycapsule(func: AggregateUDFExportable | _PyCapsule) -> AggregateUDF: """Create an Aggregate UDF from AggregateUDF PyCapsule object. This function will instantiate a Aggregate UDF that uses a DataFusion AggregateUDF that is exported via the FFI bindings. """ - name = str(func.__class__) + if _is_pycapsule(func): + aggregate = cast("AggregateUDF", object.__new__(AggregateUDF)) + aggregate._udaf = df_internal.AggregateUDF.from_pycapsule(func) + return aggregate + + capsule = cast("AggregateUDFExportable", func) + name = str(capsule.__class__) return AggregateUDF( name=name, - accumulator=func, + accumulator=capsule, input_types=None, return_type=None, state_type=None, @@ -708,7 +810,7 @@ def udwf( input_types: pa.DataType | list[pa.DataType], return_type: pa.DataType, volatility: Volatility | str, - name: Optional[str] = None, + name: str | None = None, ) -> Callable[..., WindowUDF]: ... @overload @@ -718,7 +820,7 @@ def udwf( input_types: pa.DataType | list[pa.DataType], return_type: pa.DataType, volatility: Volatility | str, - name: Optional[str] = None, + name: str | None = None, ) -> WindowUDF: ... 
@staticmethod @@ -787,7 +889,7 @@ def _create_window_udf( input_types: pa.DataType | list[pa.DataType], return_type: pa.DataType, volatility: Volatility | str, - name: Optional[str] = None, + name: str | None = None, ) -> WindowUDF: """Create a WindowUDF instance from function arguments.""" if not callable(func): @@ -825,7 +927,7 @@ def _create_window_udf_decorator( input_types: pa.DataType | list[pa.DataType], return_type: pa.DataType, volatility: Volatility | str, - name: Optional[str] = None, + name: str | None = None, ) -> Callable[[Callable[[], WindowEvaluator]], Callable[..., Expr]]: """Create a decorator for a WindowUDF.""" @@ -867,16 +969,14 @@ class TableFunction: """ def __init__( - self, - name: str, - func: Callable[[], any], + self, name: str, func: Callable[[], any], ctx: SessionContext | None = None ) -> None: """Instantiate a user-defined table function (UDTF). See :py:func:`udtf` for a convenience function and argument descriptions. """ - self._udtf = df_internal.TableFunction(name, func) + self._udtf = df_internal.TableFunction(name, func, ctx) def __call__(self, *args: Expr) -> Any: """Execute the UDTF and return a table provider.""" @@ -922,7 +1022,7 @@ def _create_table_udf( @staticmethod def _create_table_udf_decorator( - name: Optional[str] = None, + name: str | None = None, ) -> Callable[[Callable[[], WindowEvaluator]], Callable[..., Expr]]: """Create a decorator for a WindowUDF.""" diff --git a/python/tests/conftest.py b/python/tests/conftest.py index 9548fbfe4..26ed7281d 100644 --- a/python/tests/conftest.py +++ b/python/tests/conftest.py @@ -17,7 +17,7 @@ import pyarrow as pa import pytest -from datafusion import SessionContext +from datafusion import DataFrame, SessionContext from pyarrow.csv import write_csv @@ -49,3 +49,12 @@ def database(ctx, tmp_path): delimiter=",", schema_infer_max_records=10, ) + + +@pytest.fixture +def fail_collect(monkeypatch): + def _fail_collect(self, *args, **kwargs): # pragma: no cover - failure path + msg = 
"collect should not be called" + raise AssertionError(msg) + + monkeypatch.setattr(DataFrame, "collect", _fail_collect) diff --git a/python/tests/test_aggregation.py b/python/tests/test_aggregation.py index 17767ea1a..240332848 100644 --- a/python/tests/test_aggregation.py +++ b/python/tests/test_aggregation.py @@ -88,7 +88,7 @@ def df_aggregate_100(): f.covar_samp(column("b"), column("c")), lambda a, b, c, d: np.array(np.cov(b, c, ddof=1)[0][1]), ), - # f.grouping(col_a), # No physical plan implemented yet + # f.grouping(col_a), # noqa: ERA001 No physical plan implemented yet (f.max(column("a")), lambda a, b, c, d: np.array(np.max(a))), (f.mean(column("b")), lambda a, b, c, d: np.array(np.mean(b))), (f.median(column("b")), lambda a, b, c, d: np.array(np.median(b))), @@ -141,14 +141,14 @@ def test_aggregation_stats(df, agg_expr, calc_expected): ), ( f.approx_percentile_cont_with_weight(column("b"), lit(0.6), 0.5), - pa.array([6], type=pa.float64()), + pa.array([4], type=pa.float64()), False, ), ( f.approx_percentile_cont_with_weight( column("b").sort(ascending=False, nulls_first=False), lit(0.6), 0.5 ), - pa.array([6], type=pa.float64()), + pa.array([4], type=pa.float64()), False, ), ( diff --git a/python/tests/test_catalog.py b/python/tests/test_catalog.py index 08f494dee..c89da36bf 100644 --- a/python/tests/test_catalog.py +++ b/python/tests/test_catalog.py @@ -16,11 +16,16 @@ # under the License. 
from __future__ import annotations +from typing import TYPE_CHECKING + import datafusion as dfn import pyarrow as pa import pyarrow.dataset as ds import pytest -from datafusion import SessionContext, Table, udtf +from datafusion import Catalog, SessionContext, Table, udtf + +if TYPE_CHECKING: + from datafusion.catalog import CatalogProvider, CatalogProviderExportable # Note we take in `database` as a variable even though we don't use @@ -76,6 +81,12 @@ def table_exist(self, name: str) -> bool: return name in self.tables +class CustomErrorSchemaProvider(CustomSchemaProvider): + def table(self, name: str) -> Table | None: + message = f"{name} is not an acceptable name" + raise ValueError(message) + + class CustomCatalogProvider(dfn.catalog.CatalogProvider): def __init__(self): self.schemas = {"my_schema": CustomSchemaProvider()} @@ -93,6 +104,40 @@ def deregister_schema(self, name, cascade: bool): del self.schemas[name] +class CustomCatalogProviderList(dfn.catalog.CatalogProviderList): + def __init__(self): + self.catalogs = {"my_catalog": CustomCatalogProvider()} + + def catalog_names(self) -> set[str]: + return set(self.catalogs.keys()) + + def catalog(self, name: str) -> Catalog | None: + return self.catalogs[name] + + def register_catalog( + self, name: str, catalog: CatalogProviderExportable | CatalogProvider | Catalog + ) -> None: + self.catalogs[name] = catalog + + +class CustomTableProviderFactory(dfn.catalog.TableProviderFactory): + def create(self, cmd: dfn.expr.CreateExternalTable): + assert cmd.name() == "test_table_factory" + return create_dataset() + + +def test_python_catalog_provider_list(ctx: SessionContext): + ctx.register_catalog_provider_list(CustomCatalogProviderList()) + + # Ensure `datafusion` catalog does not exist since + # we replaced the catalog list + assert ctx.catalog_names() == {"my_catalog"} + + # Ensure registering works + ctx.register_catalog_provider("second_catalog", Catalog.memory_catalog()) + assert ctx.catalog_names() == 
{"my_catalog", "second_catalog"} + + def test_python_catalog_provider(ctx: SessionContext): ctx.register_catalog_provider("my_catalog", CustomCatalogProvider()) @@ -186,6 +231,33 @@ def test_schema_register_table_with_pyarrow_dataset(ctx: SessionContext): schema.deregister_table(table_name) +def test_exception_not_mangled(ctx: SessionContext): + """Test registering all python providers and running a query against them.""" + + catalog_name = "custom_catalog" + schema_name = "custom_schema" + + ctx.register_catalog_provider(catalog_name, CustomCatalogProvider()) + + catalog = ctx.catalog(catalog_name) + + # Clean out previous schemas if they exist so we can start clean + for schema_name in catalog.schema_names(): + catalog.deregister_schema(schema_name, cascade=False) + + catalog.register_schema(schema_name, CustomErrorSchemaProvider()) + + schema = catalog.schema(schema_name) + + for table_name in schema.table_names(): + schema.deregister_table(table_name) + + schema.register_table("test_table", create_dataset()) + + with pytest.raises(ValueError, match=r"^test_table is not an acceptable name$"): + ctx.sql(f"select * from {catalog_name}.{schema_name}.test_table") + + def test_in_end_to_end_python_providers(ctx: SessionContext): """Test registering all python providers and running a query against them.""" @@ -248,3 +320,24 @@ def my_table_function_udtf() -> Table: assert len(result[0]) == 1 assert len(result[0][0]) == 1 assert result[0][0][0].as_py() == 3 + + +def test_register_python_table_provider_factory(ctx: SessionContext): + ctx.register_table_factory("CUSTOM_FACTORY", CustomTableProviderFactory()) + + ctx.sql(""" + CREATE EXTERNAL TABLE test_table_factory + STORED AS CUSTOM_FACTORY + LOCATION foo; + """).collect() + + result = ctx.sql("SELECT * FROM test_table_factory;").collect() + + expect = [ + pa.RecordBatch.from_arrays( + [pa.array([1, 2, 3]), pa.array([4, 5, 6])], + names=["a", "b"], + ) + ] + + assert result == expect diff --git 
a/python/tests/test_context.py b/python/tests/test_context.py index 94d1e6a39..5df6ed20f 100644 --- a/python/tests/test_context.py +++ b/python/tests/test_context.py @@ -22,6 +22,7 @@ import pyarrow.dataset as ds import pytest from datafusion import ( + CsvReadOptions, DataFrame, RuntimeEnvBuilder, SessionConfig, @@ -357,10 +358,16 @@ def test_register_table_from_dataframe(ctx): assert [b.to_pydict() for b in result] == [{"a": [1, 2]}] -def test_register_table_from_dataframe_into_view(ctx): +@pytest.mark.parametrize("temporary", [True, False]) +def test_register_table_from_dataframe_into_view(ctx, temporary): df = ctx.from_pydict({"a": [1, 2]}) - table = df.into_view() + table = df.into_view(temporary=temporary) assert isinstance(table, Table) + if temporary: + assert table.kind == "temporary" + else: + assert table.kind == "view" + ctx.register_table("view_tbl", table) result = ctx.sql("SELECT * FROM view_tbl").collect() assert [b.to_pydict() for b in result] == [{"a": [1, 2]}] @@ -620,6 +627,8 @@ def test_read_csv_list(ctx): def test_read_csv_compressed(ctx, tmp_path): test_data_path = pathlib.Path("testing/data/csv/aggregate_test_100.csv") + expected = ctx.read_csv(test_data_path).collect() + # File compression type gzip_path = tmp_path / "aggregate_test_100.csv.gz" @@ -630,7 +639,13 @@ def test_read_csv_compressed(ctx, tmp_path): gzipped_file.writelines(csv_file) csv_df = ctx.read_csv(gzip_path, file_extension=".gz", file_compression_type="gz") - csv_df.select(column("c1")).show() + assert csv_df.collect() == expected + + csv_df = ctx.read_csv( + gzip_path, + options=CsvReadOptions(file_extension=".gz", file_compression_type="gz"), + ) + assert csv_df.collect() == expected def test_read_parquet(ctx): @@ -704,3 +719,154 @@ def test_create_dataframe_with_global_ctx(batch): result = df.collect()[0].column(0) assert result == pa.array([4, 5, 6]) + + +def test_csv_read_options_builder_pattern(): + """Test CsvReadOptions builder pattern.""" + from datafusion import 
CsvReadOptions + + options = ( + CsvReadOptions() + .with_has_header(False) # noqa: FBT003 + .with_delimiter("|") + .with_quote("'") + .with_schema_infer_max_records(2000) + .with_truncated_rows(True) # noqa: FBT003 + .with_newlines_in_values(True) # noqa: FBT003 + .with_file_extension(".tsv") + ) + assert options.has_header is False + assert options.delimiter == "|" + assert options.quote == "'" + assert options.schema_infer_max_records == 2000 + assert options.truncated_rows is True + assert options.newlines_in_values is True + assert options.file_extension == ".tsv" + + +def read_csv_with_options_inner( + tmp_path: pathlib.Path, + csv_content: str, + options: CsvReadOptions, + expected: pa.RecordBatch, + as_read: bool, + global_ctx: bool, +) -> None: + from datafusion import SessionContext + + # Create a test CSV file + group_dir = tmp_path / "group=a" + group_dir.mkdir(exist_ok=True) + + csv_path = group_dir / "test.csv" + csv_path.write_text(csv_content, newline="\n") + + ctx = SessionContext() + + if as_read: + if global_ctx: + from datafusion.io import read_csv + + df = read_csv(str(tmp_path), options=options) + else: + df = ctx.read_csv(str(tmp_path), options=options) + else: + ctx.register_csv("test_table", str(tmp_path), options=options) + df = ctx.sql("SELECT * FROM test_table") + df.show() + + # Verify the data + result = df.collect() + assert len(result) == 1 + assert result[0] == expected + + +@pytest.mark.parametrize( + ("as_read", "global_ctx"), + [ + (True, True), + (True, False), + (False, False), + ], +) +def test_read_csv_with_options(tmp_path, as_read, global_ctx): + """Test reading CSV with CsvReadOptions.""" + + csv_content = "Alice;30;|New York; NY|\nBob;25\n#Charlie;35;Paris\nPhil;75;Detroit' MI\nKarin;50;|Stockholm\nSweden|" # noqa: E501 + + # Some of the read options are difficult to test in combination + # such as schema and schema_infer_max_records so run multiple tests + # file_sort_order doesn't impact reading, but included here to 
ensure + # all options parse correctly + options = CsvReadOptions( + has_header=False, + delimiter=";", + quote="|", + terminator="\n", + escape="\\", + comment="#", + newlines_in_values=True, + schema_infer_max_records=1, + null_regex="[pP]+aris", + truncated_rows=True, + file_sort_order=[[column("column_1").sort(), column("column_2")], ["column_3"]], + ) + + expected = pa.RecordBatch.from_arrays( + [ + pa.array(["Alice", "Bob", "Phil", "Karin"]), + pa.array([30, 25, 75, 50]), + pa.array(["New York; NY", None, "Detroit' MI", "Stockholm\nSweden"]), + ], + names=["column_1", "column_2", "column_3"], + ) + + read_csv_with_options_inner( + tmp_path, csv_content, options, expected, as_read, global_ctx + ) + + schema = pa.schema( + [ + pa.field("name", pa.string(), nullable=False), + pa.field("age", pa.float32(), nullable=False), + pa.field("location", pa.string(), nullable=True), + ] + ) + options.with_schema(schema) + + expected = pa.RecordBatch.from_arrays( + [ + pa.array(["Alice", "Bob", "Phil", "Karin"]), + pa.array([30.0, 25.0, 75.0, 50.0]), + pa.array(["New York; NY", None, "Detroit' MI", "Stockholm\nSweden"]), + ], + schema=schema, + ) + + read_csv_with_options_inner( + tmp_path, csv_content, options, expected, as_read, global_ctx + ) + + csv_content = "name,age\nAlice,30\nBob,25\nCharlie,35\nDiego,40\nEmily,15" + + expected = pa.RecordBatch.from_arrays( + [ + pa.array(["Alice", "Bob", "Charlie", "Diego", "Emily"]), + pa.array([30, 25, 35, 40, 15]), + pa.array(["a", "a", "a", "a", "a"]), + ], + schema=pa.schema( + [ + pa.field("name", pa.string(), nullable=True), + pa.field("age", pa.int64(), nullable=True), + pa.field("group", pa.string(), nullable=False), + ] + ), + ) + options = CsvReadOptions( + table_partition_cols=[("group", pa.string())], + ) + + read_csv_with_options_inner( + tmp_path, csv_content, options, expected, as_read, global_ctx + ) diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py index 9317711f4..759d6278c 100644 --- 
a/python/tests/test_dataframe.py +++ b/python/tests/test_dataframe.py @@ -21,6 +21,7 @@ import re import threading import time +from pathlib import Path from typing import Any import pyarrow as pa @@ -31,10 +32,12 @@ InsertOp, ParquetColumnOptions, ParquetWriterOptions, + RecordBatch, SessionContext, WindowFrame, column, literal, + udf, ) from datafusion import ( col as df_col, @@ -52,6 +55,8 @@ from datafusion.expr import EXPR_TYPE_ERROR, Window from pyarrow.csv import write_csv +pa_cffi = pytest.importorskip("pyarrow.cffi") + MB = 1024 * 1024 @@ -86,6 +91,39 @@ def large_df(): return ctx.from_arrow(batch) +@pytest.fixture +def large_multi_batch_df(): + """Create a DataFrame with multiple record batches for testing stream behavior. + + This fixture creates 10 batches of 10,000 rows each (100,000 rows total), + ensuring the DataFrame spans multiple batches. This is essential for testing + that memory limits actually cause early stream termination rather than + truncating all collected data. 
+ """ + ctx = SessionContext() + + # Create multiple batches, each with 10,000 rows + batches = [] + rows_per_batch = 10000 + num_batches = 10 + + for batch_idx in range(num_batches): + start_row = batch_idx * rows_per_batch + end_row = start_row + rows_per_batch + data = { + "a": list(range(start_row, end_row)), + "b": [f"s-{i}" for i in range(start_row, end_row)], + "c": [float(i + 0.1) for i in range(start_row, end_row)], + } + batch = pa.record_batch(data) + batches.append(batch) + + # Register as record batches to maintain multi-batch structure + # Using [batches] wraps list in another list as required by register_record_batches + ctx.register_record_batches("large_multi_batch_data", [batches]) + return ctx.table("large_multi_batch_data") + + @pytest.fixture def struct_df(): ctx = SessionContext() @@ -257,10 +295,17 @@ def test_drop_quoted_columns(): ctx = SessionContext() batch = pa.RecordBatch.from_arrays([pa.array([1, 2, 3])], names=["ID_For_Students"]) df = ctx.create_dataframe([[batch]]) - - # Both should work + # here we must quote to match the original column name assert df.drop('"ID_For_Students"').schema().names == [] - assert df.drop("ID_For_Students").schema().names == [] + + batch = pa.RecordBatch.from_arrays( + [pa.array([1, 2, 3]), pa.array([4, 5, 6])], names=["a", "b"] + ) + df = ctx.create_dataframe([[batch]]) + # with a lower case column, both 'a' and '"a"' work + assert df.drop("a").schema().names == ["b"] + df = ctx.create_dataframe([[batch]]) + assert df.drop('"a"').schema().names == ["b"] def test_select_mixed_expr_string(df): @@ -306,6 +351,29 @@ def test_filter(df): assert result.column(2) == pa.array([5]) +def test_filter_string_predicates(df): + df_str = df.filter("a > 2") + result = df_str.collect()[0] + + assert result.column(0) == pa.array([3]) + assert result.column(1) == pa.array([6]) + assert result.column(2) == pa.array([8]) + + df_mixed = df.filter("a > 1", column("b") != literal(6)) + result_mixed = df_mixed.collect()[0] + + 
assert result_mixed.column(0) == pa.array([2]) + assert result_mixed.column(1) == pa.array([5]) + assert result_mixed.column(2) == pa.array([5]) + + df_strings = df.filter("a > 1", "b < 6") + result_strings = df_strings.collect()[0] + + assert result_strings.column(0) == pa.array([2]) + assert result_strings.column(1) == pa.array([5]) + assert result_strings.column(2) == pa.array([5]) + + def test_parse_sql_expr(df): plan1 = df.filter(df.parse_sql_expr("a > 2")).logical_plan() plan2 = df.filter(column("a") > literal(2)).logical_plan() @@ -388,9 +456,16 @@ def test_aggregate_tuple_aggs(df): assert result_tuple == result_list -def test_filter_string_unsupported(df): - with pytest.raises(TypeError, match=re.escape(EXPR_TYPE_ERROR)): - df.filter("a > 1") +def test_filter_string_equivalent(df): + df1 = df.filter("a > 1").to_pydict() + df2 = df.filter(column("a") > literal(1)).to_pydict() + assert df1 == df2 + + +def test_filter_string_invalid(df): + with pytest.raises(Exception) as excinfo: + df.filter("this is not valid sql").collect() + assert "Expected Expr" not in str(excinfo.value) def test_drop(df): @@ -447,8 +522,8 @@ def test_tail(df): assert result.column(2) == pa.array([8]) -def test_with_column(df): - df = df.with_column("c", column("a") + column("b")) +def test_with_column_sql_expression(df): + df = df.with_column("c", "a + b") # execute and collect the first (and only) batch result = df.collect()[0] @@ -462,11 +537,19 @@ def test_with_column(df): assert result.column(2) == pa.array([5, 7, 9]) -def test_with_column_invalid_expr(df): - with pytest.raises( - TypeError, match=r"Use col\(\)/column\(\) or lit\(\)/literal\(\)" - ): - df.with_column("c", "a") +def test_with_column(df): + df = df.with_column("c", column("a") + column("b")) + + # execute and collect the first (and only) batch + result = df.collect()[0] + + assert result.schema.field(0).name == "a" + assert result.schema.field(1).name == "b" + assert result.schema.field(2).name == "c" + + assert 
result.column(0) == pa.array([1, 2, 3]) + assert result.column(1) == pa.array([4, 5, 6]) + assert result.column(2) == pa.array([5, 7, 9]) def test_with_columns(df): @@ -500,15 +583,35 @@ def test_with_columns(df): assert result.column(6) == pa.array([5, 7, 9]) -def test_with_columns_invalid_expr(df): - with pytest.raises(TypeError, match=re.escape(EXPR_TYPE_ERROR)): - df.with_columns("a") - with pytest.raises(TypeError, match=re.escape(EXPR_TYPE_ERROR)): - df.with_columns(c="a") - with pytest.raises(TypeError, match=re.escape(EXPR_TYPE_ERROR)): - df.with_columns(["a"]) - with pytest.raises(TypeError, match=re.escape(EXPR_TYPE_ERROR)): - df.with_columns(c=["a"]) +def test_with_columns_str(df): + df = df.with_columns( + "a + b as c", + "a + b as d", + [ + "a + b as e", + "a + b as f", + ], + g="a + b", + ) + + # execute and collect the first (and only) batch + result = df.collect()[0] + + assert result.schema.field(0).name == "a" + assert result.schema.field(1).name == "b" + assert result.schema.field(2).name == "c" + assert result.schema.field(3).name == "d" + assert result.schema.field(4).name == "e" + assert result.schema.field(5).name == "f" + assert result.schema.field(6).name == "g" + + assert result.column(0) == pa.array([1, 2, 3]) + assert result.column(1) == pa.array([4, 5, 6]) + assert result.column(2) == pa.array([5, 7, 9]) + assert result.column(3) == pa.array([5, 7, 9]) + assert result.column(4) == pa.array([5, 7, 9]) + assert result.column(5) == pa.array([5, 7, 9]) + assert result.column(6) == pa.array([5, 7, 9]) def test_cast(df): @@ -520,6 +623,41 @@ def test_cast(df): assert df.schema() == expected +def test_iter_batches(df): + batches = [] + for batch in df: + batches.append(batch) # noqa: PERF402 + + # Delete DataFrame to ensure RecordBatches remain valid + del df + + assert len(batches) == 1 + + batch = batches[0] + assert isinstance(batch, RecordBatch) + pa_batch = batch.to_pyarrow() + assert pa_batch.column(0).to_pylist() == [1, 2, 3] + assert 
pa_batch.column(1).to_pylist() == [4, 5, 6] + assert pa_batch.column(2).to_pylist() == [8, 5, 8] + + +def test_iter_returns_datafusion_recordbatch(df): + for batch in df: + assert isinstance(batch, RecordBatch) + + +def test_execute_stream_basic(df): + stream = df.execute_stream() + batches = list(stream) + + assert len(batches) == 1 + assert isinstance(batches[0], RecordBatch) + pa_batch = batches[0].to_pyarrow() + assert pa_batch.column(0).to_pylist() == [1, 2, 3] + assert pa_batch.column(1).to_pylist() == [4, 5, 6] + assert pa_batch.column(2).to_pylist() == [8, 5, 8] + + def test_with_column_renamed(df): df = df.with_column("c", column("a") + column("b")).with_column_renamed("c", "sum") @@ -550,7 +688,6 @@ def test_unnest_without_nulls(nested_df): assert result.column(1) == pa.array([7, 8, 8, 9, 9, 9]) -@pytest.mark.filterwarnings("ignore:`join_keys`:DeprecationWarning") def test_join(): ctx = SessionContext() @@ -567,26 +704,41 @@ def test_join(): df1 = ctx.create_dataframe([[batch]], "r") df2 = df.join(df1, on="a", how="inner") - df2.show() - df2 = df2.sort(column("l.a")) + df2 = df2.sort(column("a")) table = pa.Table.from_batches(df2.collect()) expected = {"a": [1, 2], "c": [8, 10], "b": [4, 5]} assert table.to_pydict() == expected - df2 = df.join(df1, left_on="a", right_on="a", how="inner") - df2.show() - df2 = df2.sort(column("l.a")) - table = pa.Table.from_batches(df2.collect()) + # Test the default behavior for dropping duplicate keys + # Since we may have a duplicate column name and pa.Table() + # hides the fact, instead we need to explicitly check the + # resultant arrays. 
+ df2 = df.join( + df1, left_on="a", right_on="a", how="inner", coalesce_duplicate_keys=True + ) + df2 = df2.sort(column("a")) + result = df2.collect()[0] + assert result.num_columns == 3 + assert result.column(0) == pa.array([1, 2], pa.int64()) + assert result.column(1) == pa.array([4, 5], pa.int64()) + assert result.column(2) == pa.array([8, 10], pa.int64()) - expected = {"a": [1, 2], "c": [8, 10], "b": [4, 5]} - assert table.to_pydict() == expected + df2 = df.join( + df1, left_on="a", right_on="a", how="inner", coalesce_duplicate_keys=False + ) + df2 = df2.sort(column("l.a")) + result = df2.collect()[0] + assert result.num_columns == 4 + assert result.column(0) == pa.array([1, 2], pa.int64()) + assert result.column(1) == pa.array([4, 5], pa.int64()) + assert result.column(2) == pa.array([1, 2], pa.int64()) + assert result.column(3) == pa.array([8, 10], pa.int64()) # Verify we don't make a breaking change to pre-43.0.0 # where users would pass join_keys as a positional argument df2 = df.join(df1, (["a"], ["a"]), how="inner") - df2.show() - df2 = df2.sort(column("l.a")) + df2 = df2.sort(column("a")) table = pa.Table.from_batches(df2.collect()) expected = {"a": [1, 2], "c": [8, 10], "b": [4, 5]} @@ -611,7 +763,7 @@ def test_join_invalid_params(): with pytest.deprecated_call(): df2 = df.join(df1, join_keys=(["a"], ["a"]), how="inner") df2.show() - df2 = df2.sort(column("l.a")) + df2 = df2.sort(column("a")) table = pa.Table.from_batches(df2.collect()) expected = {"a": [1, 2], "c": [8, 10], "b": [4, 5]} @@ -669,6 +821,35 @@ def test_join_on(): assert table.to_pydict() == expected +def test_join_full_with_drop_duplicate_keys(): + ctx = SessionContext() + + batch = pa.RecordBatch.from_arrays( + [pa.array([1, 3, 5, 7, 9]), pa.array([True, True, True, True, True])], + names=["log_time", "key_frame"], + ) + key_frame = ctx.create_dataframe([[batch]]) + + batch = pa.RecordBatch.from_arrays( + [pa.array([2, 4, 6, 8, 10])], + names=["log_time"], + ) + query_times = 
ctx.create_dataframe([[batch]]) + + merged = query_times.join( + key_frame, + left_on="log_time", + right_on="log_time", + how="full", + coalesce_duplicate_keys=True, + ) + merged = merged.sort(column("log_time")) + result = merged.collect()[0] + + assert result.num_columns == 2 + assert result.column(0).to_pylist() == [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] + + def test_join_on_invalid_expr(): ctx = SessionContext() @@ -982,33 +1163,14 @@ def test_invalid_window_frame(units, start_bound, end_bound): def test_window_frame_defaults_match_postgres(partitioned_df): - # ref: https://github.com/apache/datafusion-python/issues/688 - - window_frame = WindowFrame("rows", None, None) - col_a = column("a") - # Using `f.window` with or without an unbounded window_frame produces the same - # results. These tests are included as a regression check but can be removed when - # f.window() is deprecated in favor of using the .over() approach. - no_frame = f.window("avg", [col_a]).alias("no_frame") - with_frame = f.window("avg", [col_a], window_frame=window_frame).alias("with_frame") - df_1 = partitioned_df.select(col_a, no_frame, with_frame) - - expected = { - "a": [0, 1, 2, 3, 4, 5, 6], - "no_frame": [3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0], - "with_frame": [3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0], - } - - assert df_1.sort(col_a).to_pydict() == expected - # When order is not set, the default frame should be unbounded preceding to # unbounded following. When order is set, the default frame is unbounded preceding # to current row. 
no_order = f.avg(col_a).over(Window()).alias("over_no_order") with_order = f.avg(col_a).over(Window(order_by=[col_a])).alias("over_with_order") - df_2 = partitioned_df.select(col_a, no_order, with_order) + df = partitioned_df.select(col_a, no_order, with_order) expected = { "a": [0, 1, 2, 3, 4, 5, 6], @@ -1016,7 +1178,7 @@ def test_window_frame_defaults_match_postgres(partitioned_df): "over_with_order": [0.0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0], } - assert df_2.sort(col_a).to_pydict() == expected + assert df.sort(col_a).to_pydict() == expected def _build_last_value_df(df): @@ -1316,7 +1478,7 @@ def get_header_style(self) -> str: def test_html_formatter_memory(df, clean_formatter_state): """Test the memory and row control parameters in DataFrameHtmlFormatter.""" - configure_formatter(max_memory_bytes=10, min_rows_display=1) + configure_formatter(max_memory_bytes=10, min_rows=1) html_output = df._repr_html_() # Count the number of table rows in the output @@ -1326,7 +1488,7 @@ def test_html_formatter_memory(df, clean_formatter_state): assert tr_count == 2 # 1 for header row, 1 for data row assert "data truncated" in html_output.lower() - configure_formatter(max_memory_bytes=10 * MB, min_rows_display=1) + configure_formatter(max_memory_bytes=10 * MB, min_rows=1) html_output = df._repr_html_() # With larger memory limit and min_rows=2, should display all rows tr_count = count_table_rows(html_output) @@ -1336,15 +1498,136 @@ def test_html_formatter_memory(df, clean_formatter_state): assert "data truncated" not in html_output.lower() -def test_html_formatter_repr_rows(df, clean_formatter_state): - configure_formatter(min_rows_display=2, repr_rows=2) +def test_html_formatter_memory_boundary_conditions(large_df, clean_formatter_state): + """Test memory limit behavior at boundary conditions with large dataset. 
+ + This test validates that the formatter correctly handles edge cases when + the memory limit is reached with a large dataset (100,000 rows), ensuring + that min_rows constraint is properly respected while respecting memory limits. + Uses large_df to actually test memory limit behavior with realistic data sizes. + """ + + # Get the raw size of the data to test boundary conditions + # First, capture output with no limits + # NOTE: max_rows=200000 is set well above the dataset size (100k rows) to ensure + # we're testing memory limits, not row limits. Default max_rows=10 would + # truncate before memory limit is reached. + configure_formatter(max_memory_bytes=10 * MB, min_rows=1, max_rows=200000) + unrestricted_output = large_df._repr_html_() + unrestricted_rows = count_table_rows(unrestricted_output) + + # Test 1: Very small memory limit should still respect min_rows + # With large dataset, this should definitely hit memory limit before min_rows + configure_formatter(max_memory_bytes=10, min_rows=1) + html_output = large_df._repr_html_() + tr_count = count_table_rows(html_output) + assert tr_count >= 2 # At least header + 1 data row (minimum) + # Should show truncation since we limited memory so aggressively + assert "data truncated" in html_output.lower() + + # Test 2: Memory limit at default size (2MB) should truncate the large dataset + # Default max_rows would truncate at 10 rows, so we don't set it here to test + # that memory limit is respected even with default row limit + configure_formatter(max_memory_bytes=2 * MB, min_rows=1) + html_output = large_df._repr_html_() + tr_count = count_table_rows(html_output) + assert tr_count >= 2 # At least header + min_rows + # Should be truncated since full dataset is much larger than 2MB + assert tr_count < unrestricted_rows + + # Test 3: Very large memory limit should show much more data + # NOTE: max_rows=200000 is critical here - without it, default max_rows=10 + # would limit output to 10 rows even though we have 
100MB of memory available + configure_formatter(max_memory_bytes=100 * MB, min_rows=1, max_rows=200000) + html_output = large_df._repr_html_() + tr_count = count_table_rows(html_output) + # Should show significantly more rows, possibly all + assert tr_count > 100 # Should show substantially more rows + + # Test 4: Min rows should override memory limit + # With tiny memory and larger min_rows, min_rows should win + configure_formatter(max_memory_bytes=10, min_rows=2) + html_output = large_df._repr_html_() + tr_count = count_table_rows(html_output) + assert tr_count >= 3 # At least header + 2 data rows (min_rows) + # Should show truncation message despite min_rows being satisfied + assert "data truncated" in html_output.lower() + + # Test 5: With reasonable memory and min_rows settings + # NOTE: max_rows=200000 ensures we test memory limit behavior, not row limit + configure_formatter(max_memory_bytes=2 * MB, min_rows=10, max_rows=200000) + html_output = large_df._repr_html_() + tr_count = count_table_rows(html_output) + assert tr_count >= 11 # header + at least 10 data rows (min_rows) + # Should be truncated due to memory limit + assert tr_count < unrestricted_rows + + +def test_html_formatter_stream_early_termination( + large_multi_batch_df, clean_formatter_state +): + """Test that memory limits cause early stream termination with multi-batch data. + + This test specifically validates that the formatter stops collecting data when + the memory limit is reached, rather than collecting all data and then truncating. + The large_multi_batch_df fixture creates 10 record batches, allowing us to verify + that not all batches are consumed when memory limit is hit. 
+ + Key difference from test_html_formatter_memory_boundary_conditions: + - Uses multi-batch DataFrame to verify stream termination behavior + - Tests with memory limit exceeded by 2-3 batches but not 1 batch + - Verifies partial data + truncation message + respects min_rows + """ + + # Get baseline: how much data fits without memory limit + configure_formatter(max_memory_bytes=100 * MB, min_rows=1, max_rows=200000) + unrestricted_output = large_multi_batch_df._repr_html_() + unrestricted_rows = count_table_rows(unrestricted_output) + + # Test 1: Memory limit exceeded by ~2 batches (each batch ~10k rows) + # With 1 batch (~1-2MB), we should have space. With 2-3 batches, we exceed limit. + # Set limit to ~3MB to ensure we collect ~1 batch before hitting limit + configure_formatter(max_memory_bytes=3 * MB, min_rows=1, max_rows=200000) + html_output = large_multi_batch_df._repr_html_() + tr_count = count_table_rows(html_output) + + # Should show significant truncation (not all 100k rows) + assert tr_count < unrestricted_rows, "Should be truncated by memory limit" + assert tr_count >= 2, "Should respect min_rows" + assert "data truncated" in html_output.lower(), "Should indicate truncation" + + # Test 2: Very tight memory limit should still respect min_rows + # Even with tiny memory (10 bytes), should show at least min_rows + configure_formatter(max_memory_bytes=10, min_rows=5, max_rows=200000) + html_output = large_multi_batch_df._repr_html_() + tr_count = count_table_rows(html_output) + + assert tr_count >= 6, "Should show header + at least min_rows (5)" + assert "data truncated" in html_output.lower(), "Should indicate truncation" + + # Test 3: Memory limit should take precedence over max_rows in early termination + # With max_rows=100 but small memory limit, should terminate early due to memory + configure_formatter(max_memory_bytes=2 * MB, min_rows=1, max_rows=100) + html_output = large_multi_batch_df._repr_html_() + tr_count = count_table_rows(html_output) + + # 
Should be truncated by memory limit (showing more than max_rows would suggest + # but less than unrestricted) + assert tr_count >= 2, "Should respect min_rows" + assert tr_count < unrestricted_rows, "Should be truncated" + # Output should indicate why truncation occurred + assert "data truncated" in html_output.lower() + + +def test_html_formatter_max_rows(df, clean_formatter_state): + configure_formatter(min_rows=2, max_rows=2) html_output = df._repr_html_() tr_count = count_table_rows(html_output) # Table should have header row (1) + 2 data rows = 3 rows assert tr_count == 3 - configure_formatter(min_rows_display=2, repr_rows=3) + configure_formatter(min_rows=2, max_rows=3) html_output = df._repr_html_() tr_count = count_table_rows(html_output) @@ -1370,17 +1653,42 @@ def test_html_formatter_validation(): with pytest.raises(ValueError, match="max_memory_bytes must be a positive integer"): DataFrameHtmlFormatter(max_memory_bytes=-100) - with pytest.raises(ValueError, match="min_rows_display must be a positive integer"): - DataFrameHtmlFormatter(min_rows_display=0) + with pytest.raises(ValueError, match="min_rows must be a positive integer"): + DataFrameHtmlFormatter(min_rows=0) + + with pytest.raises(ValueError, match="min_rows must be a positive integer"): + DataFrameHtmlFormatter(min_rows=-5) + + with pytest.raises(ValueError, match="max_rows must be a positive integer"): + DataFrameHtmlFormatter(max_rows=0) + + with pytest.raises(ValueError, match="max_rows must be a positive integer"): + DataFrameHtmlFormatter(max_rows=-10) + + with pytest.raises( + ValueError, match="min_rows must be less than or equal to max_rows" + ): + DataFrameHtmlFormatter(min_rows=5, max_rows=4) - with pytest.raises(ValueError, match="min_rows_display must be a positive integer"): - DataFrameHtmlFormatter(min_rows_display=-5) - with pytest.raises(ValueError, match="repr_rows must be a positive integer"): - DataFrameHtmlFormatter(repr_rows=0) +def 
test_repr_rows_backward_compatibility(clean_formatter_state): + """Test that repr_rows parameter still works as deprecated alias.""" + # Should work when not conflicting with max_rows + with pytest.warns(DeprecationWarning, match="repr_rows parameter is deprecated"): + formatter = DataFrameHtmlFormatter(repr_rows=15, min_rows=10) + assert formatter.max_rows == 15 + assert formatter.repr_rows == 15 - with pytest.raises(ValueError, match="repr_rows must be a positive integer"): - DataFrameHtmlFormatter(repr_rows=-10) + # Should fail when conflicting with max_rows + with pytest.raises(ValueError, match="Cannot specify both repr_rows and max_rows"): + DataFrameHtmlFormatter(repr_rows=5, max_rows=10) + + # Setting repr_rows via property should warn + formatter2 = DataFrameHtmlFormatter() + with pytest.warns(DeprecationWarning, match="repr_rows is deprecated"): + formatter2.repr_rows = 7 + assert formatter2.max_rows == 7 + assert formatter2.repr_rows == 7 def test_configure_formatter(df, clean_formatter_state): @@ -1392,8 +1700,8 @@ def test_configure_formatter(df, clean_formatter_state): max_width = 500 max_height = 30 max_memory_bytes = 3 * MB - min_rows_display = 2 - repr_rows = 2 + min_rows = 2 + max_rows = 2 enable_cell_expansion = False show_truncation_message = False use_shared_styles = False @@ -1405,8 +1713,8 @@ def test_configure_formatter(df, clean_formatter_state): assert formatter_default.max_width != max_width assert formatter_default.max_height != max_height assert formatter_default.max_memory_bytes != max_memory_bytes - assert formatter_default.min_rows_display != min_rows_display - assert formatter_default.repr_rows != repr_rows + assert formatter_default.min_rows != min_rows + assert formatter_default.max_rows != max_rows assert formatter_default.enable_cell_expansion != enable_cell_expansion assert formatter_default.show_truncation_message != show_truncation_message assert formatter_default.use_shared_styles != use_shared_styles @@ -1417,8 +1725,8 @@ 
def test_configure_formatter(df, clean_formatter_state): max_width=max_width, max_height=max_height, max_memory_bytes=max_memory_bytes, - min_rows_display=min_rows_display, - repr_rows=repr_rows, + min_rows=min_rows, + max_rows=max_rows, enable_cell_expansion=enable_cell_expansion, show_truncation_message=show_truncation_message, use_shared_styles=use_shared_styles, @@ -1428,8 +1736,8 @@ def test_configure_formatter(df, clean_formatter_state): assert formatter_custom.max_width == max_width assert formatter_custom.max_height == max_height assert formatter_custom.max_memory_bytes == max_memory_bytes - assert formatter_custom.min_rows_display == min_rows_display - assert formatter_custom.repr_rows == repr_rows + assert formatter_custom.min_rows == min_rows + assert formatter_custom.max_rows == max_rows assert formatter_custom.enable_cell_expansion == enable_cell_expansion assert formatter_custom.show_truncation_message == show_truncation_message assert formatter_custom.use_shared_styles == use_shared_styles @@ -1544,7 +1852,6 @@ def test_execution_plan(aggregate_df): # indent plan will be different for everyone due to absolute path # to filename, so we just check for some expected content assert "AggregateExec:" in indent - assert "CoalesceBatchesExec:" in indent assert "RepartitionExec:" in indent assert "DataSourceExec:" in indent assert "file_type=csv" in indent @@ -1569,7 +1876,7 @@ def test_execution_plan(aggregate_df): @pytest.mark.asyncio async def test_async_iteration_of_df(aggregate_df): rows_returned = 0 - async for batch in aggregate_df.execute_stream(): + async for batch in aggregate_df: assert batch is not None rows_returned += len(batch.to_pyarrow()[0]) @@ -1584,6 +1891,14 @@ def test_repartition_by_hash(df): df.repartition_by_hash(column("a"), num=2) +def test_repartition_by_hash_sql_expression(df): + df.repartition_by_hash("a", num=2) + + +def test_repartition_by_hash_mix(df): + df.repartition_by_hash(column("a"), "b", num=2) + + def test_intersect(): 
ctx = SessionContext() @@ -1647,6 +1962,18 @@ def test_collect_partitioned(): assert [[batch]] == ctx.create_dataframe([[batch]]).collect_partitioned() +def test_collect_column(ctx: SessionContext): + batch_1 = pa.RecordBatch.from_pydict({"a": [1, 2, 3]}) + batch_2 = pa.RecordBatch.from_pydict({"a": [4, 5, 6]}) + batch_3 = pa.RecordBatch.from_pydict({"a": [7, 8, 9]}) + + ctx.register_record_batches("t", [[batch_1, batch_2], [batch_3]]) + + result = ctx.table("t").sort(column("a")).collect_column("a") + expected = pa.array([1, 2, 3, 4, 5, 6, 7, 8, 9]) + assert result == expected + + def test_union(ctx): batch = pa.RecordBatch.from_arrays( [pa.array([1, 2, 3]), pa.array([4, 5, 6])], @@ -1757,6 +2084,53 @@ def test_to_arrow_table(df): assert set(pyarrow_table.column_names) == {"a", "b", "c"} +def test_parquet_non_null_column_to_pyarrow(ctx, tmp_path): + path = tmp_path.joinpath("t.parquet") + + ctx.sql("create table t_(a int not null)").collect() + ctx.sql("insert into t_ values (1), (2), (3)").collect() + ctx.sql(f"copy (select * from t_) to '{path}'").collect() + + ctx.register_parquet("t", path) + pyarrow_table = ctx.sql("select max(a) as m from t").to_arrow_table() + assert pyarrow_table.to_pydict() == {"m": [3]} + + +def test_parquet_empty_batch_to_pyarrow(ctx, tmp_path): + path = tmp_path.joinpath("t.parquet") + + ctx.sql("create table t_(a int not null)").collect() + ctx.sql("insert into t_ values (1), (2), (3)").collect() + ctx.sql(f"copy (select * from t_) to '{path}'").collect() + + ctx.register_parquet("t", path) + pyarrow_table = ctx.sql("select * from t limit 0").to_arrow_table() + assert pyarrow_table.schema == pa.schema( + [ + pa.field("a", pa.int32(), nullable=False), + ] + ) + + +def test_parquet_null_aggregation_to_pyarrow(ctx, tmp_path): + path = tmp_path.joinpath("t.parquet") + + ctx.sql("create table t_(a int not null)").collect() + ctx.sql("insert into t_ values (1), (2), (3)").collect() + ctx.sql(f"copy (select * from t_) to '{path}'").collect() 
+ + ctx.register_parquet("t", path) + pyarrow_table = ctx.sql( + "select max(a) as m from (select * from t where a < 0)" + ).to_arrow_table() + assert pyarrow_table.to_pydict() == {"m": [None]} + assert pyarrow_table.schema == pa.schema( + [ + pa.field("m", pa.int32(), nullable=True), + ] + ) + + def test_execute_stream(df): stream = df.execute_stream() assert all(batch is not None for batch in stream) @@ -1839,6 +2213,121 @@ def test_empty_to_arrow_table(df): assert set(pyarrow_table.column_names) == {"a", "b", "c"} +def test_iter_batches_dataframe(fail_collect): + ctx = SessionContext() + + batch1 = pa.record_batch([pa.array([1])], names=["a"]) + batch2 = pa.record_batch([pa.array([2])], names=["a"]) + df = ctx.create_dataframe([[batch1], [batch2]]) + + expected = [batch1, batch2] + results = [b.to_pyarrow() for b in df] + + assert len(results) == len(expected) + for exp in expected: + assert any(got.equals(exp) for got in results) + + +def test_arrow_c_stream_to_table_and_reader(fail_collect): + ctx = SessionContext() + + # Create a DataFrame with two separate record batches + batch1 = pa.record_batch([pa.array([1])], names=["a"]) + batch2 = pa.record_batch([pa.array([2])], names=["a"]) + df = ctx.create_dataframe([[batch1], [batch2]]) + + table = pa.Table.from_batches(batch.to_pyarrow() for batch in df) + batches = table.to_batches() + + assert len(batches) == 2 + expected = [batch1, batch2] + for exp in expected: + assert any(got.equals(exp) for got in batches) + assert table.schema == df.schema() + assert table.column("a").num_chunks == 2 + + reader = pa.RecordBatchReader.from_stream(df) + assert isinstance(reader, pa.RecordBatchReader) + reader_table = pa.Table.from_batches(reader) + expected = pa.Table.from_batches([batch1, batch2]) + assert reader_table.equals(expected) + + +def test_arrow_c_stream_order(): + ctx = SessionContext() + + batch1 = pa.record_batch([pa.array([1])], names=["a"]) + batch2 = pa.record_batch([pa.array([2])], names=["a"]) + + df = 
ctx.create_dataframe([[batch1, batch2]]) + + table = pa.Table.from_batches(batch.to_pyarrow() for batch in df) + expected = pa.Table.from_batches([batch1, batch2]) + + assert table.equals(expected) + col = table.column("a") + assert col.chunk(0)[0].as_py() == 1 + assert col.chunk(1)[0].as_py() == 2 + + +def test_arrow_c_stream_schema_selection(fail_collect): + ctx = SessionContext() + + batch = pa.RecordBatch.from_arrays( + [ + pa.array([1, 2]), + pa.array([3, 4]), + pa.array([5, 6]), + ], + names=["a", "b", "c"], + ) + df = ctx.create_dataframe([[batch]]) + + requested_schema = pa.schema([("c", pa.int64()), ("a", pa.int64())]) + + c_schema = pa_cffi.ffi.new("struct ArrowSchema*") + address = int(pa_cffi.ffi.cast("uintptr_t", c_schema)) + requested_schema._export_to_c(address) + capsule_new = ctypes.pythonapi.PyCapsule_New + capsule_new.restype = ctypes.py_object + capsule_new.argtypes = [ctypes.c_void_p, ctypes.c_char_p, ctypes.c_void_p] + + reader = pa.RecordBatchReader.from_stream(df, schema=requested_schema) + + assert reader.schema == requested_schema + + batches = list(reader) + + assert len(batches) == 1 + expected_batch = pa.record_batch( + [pa.array([5, 6]), pa.array([1, 2])], names=["c", "a"] + ) + assert batches[0].equals(expected_batch) + + +def test_arrow_c_stream_schema_mismatch(fail_collect): + ctx = SessionContext() + + batch = pa.RecordBatch.from_arrays( + [pa.array([1, 2]), pa.array([3, 4])], names=["a", "b"] + ) + df = ctx.create_dataframe([[batch]]) + + bad_schema = pa.schema([("a", pa.string())]) + + c_schema = pa_cffi.ffi.new("struct ArrowSchema*") + address = int(pa_cffi.ffi.cast("uintptr_t", c_schema)) + bad_schema._export_to_c(address) + + capsule_new = ctypes.pythonapi.PyCapsule_New + capsule_new.restype = ctypes.py_object + capsule_new.argtypes = [ctypes.c_void_p, ctypes.c_char_p, ctypes.c_void_p] + bad_capsule = capsule_new(ctypes.c_void_p(address), b"arrow_schema", None) + + with pytest.raises(Exception, match="Fail to merge schema"): + 
df.__arrow_c_stream__(bad_capsule) + + def test_to_pylist(df): # Convert datafusion dataframe to Python list pylist = df.to_pylist() @@ -2131,9 +2620,7 @@ def test_write_parquet_with_options_writer_version( @pytest.mark.parametrize("writer_version", ["1.2.3", "custom-version", "0"]) def test_write_parquet_with_options_wrong_writer_version(df, tmp_path, writer_version): """Test that invalid writer versions in Parquet throw an exception.""" - with pytest.raises( - Exception, match="Unknown or unsupported parquet writer version" - ): + with pytest.raises(Exception, match="Invalid parquet writer version"): df.write_parquet_with_options( tmp_path, ParquetWriterOptions(writer_version=writer_version) ) @@ -2310,7 +2797,7 @@ def test_write_parquet_with_options_encoding(tmp_path, encoding, data_types, res def test_write_parquet_with_options_unsupported_encoding(df, tmp_path, encoding): """Test that unsupported Parquet encodings do not work.""" # BaseException is used since this throws a Rust panic: https://github.com/PyO3/pyo3/issues/3519 - with pytest.raises(BaseException, match="Encoding .*? is not supported"): + with pytest.raises(BaseException, match=r"Encoding .*? is not supported"): df.write_parquet_with_options(tmp_path, ParquetWriterOptions(encoding=encoding)) @@ -2347,11 +2834,11 @@ def test_write_parquet_with_options_bloom_filter(df, tmp_path): size_no_bloom_filter = 0 for file in path_no_bloom_filter.rglob("*.parquet"): - size_no_bloom_filter += os.path.getsize(file) + size_no_bloom_filter += Path(file).stat().st_size size_bloom_filter = 0 for file in path_bloom_filter.rglob("*.parquet"): - size_bloom_filter += os.path.getsize(file) + size_bloom_filter += Path(file).stat().st_size assert size_no_bloom_filter < size_bloom_filter @@ -2654,6 +3141,47 @@ def test_html_formatter_manual_format_html(clean_formatter_state): assert "