diff --git a/.asf.yaml b/.asf.yaml index f27975c84..cb0520c17 100644 --- a/.asf.yaml +++ b/.asf.yaml @@ -16,24 +16,31 @@ # under the License. notifications: - commits: commits@arrow.apache.org - issues: github@arrow.apache.org - pullrequests: github@arrow.apache.org + commits: commits@datafusion.apache.org + issues: github@datafusion.apache.org + pullrequests: github@datafusion.apache.org jira_options: link label worklog github: - description: "Apache Arrow DataFusion Python Bindings" - homepage: https://arrow.apache.org/datafusion + description: "Apache DataFusion Python Bindings" + homepage: https://datafusion.apache.org/python enabled_merge_buttons: squash: true merge: false rebase: false features: issues: true + protected_branches: + main: + required_status_checks: + # require branches to be up-to-date before merging + strict: true + # don't require any jobs to pass + contexts: [] staging: whoami: asf-staging - subdir: datafusion-python + subdir: python publish: whoami: asf-site - subdir: datafusion-python + subdir: python diff --git a/.cargo/config.toml b/.cargo/config.toml new file mode 100644 index 000000000..af951327f --- /dev/null +++ b/.cargo/config.toml @@ -0,0 +1,5 @@ +[target.x86_64-apple-darwin] +rustflags = ["-C", "link-arg=-undefined", "-C", "link-arg=dynamic_lookup"] + +[target.aarch64-apple-darwin] +rustflags = ["-C", "link-arg=-undefined", "-C", "link-arg=dynamic_lookup"] diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 000000000..411e60291 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,12 @@ +.cargo +.github +.pytest_cache +ci +conda +dev +docs +examples +parquet +target +testing +venv \ No newline at end of file diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index fc8910766..455a0dc1a 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -15,34 +15,248 @@ # specific language governing permissions and limitations # under the License. 
-name: Python Release Build +# Reusable workflow for running building +# This ensures the same tests run for both debug (PRs) and release (main/tags) builds + +name: Build + on: - pull_request: - branches: ["main"] - push: - tags: ["*-rc*"] - branches: ["branch-*"] + workflow_call: + inputs: + build_mode: + description: 'Build mode: debug or release' + required: true + type: string + run_wheels: + description: 'Whether to build distribution wheels' + required: false + type: boolean + default: false + +env: + CARGO_TERM_COLOR: always + RUST_BACKTRACE: 1 jobs: + # ============================================ + # Linting Jobs + # ============================================ + lint-rust: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v6 + + - name: Setup Rust + uses: dtolnay/rust-toolchain@stable + with: + toolchain: "nightly" + components: rustfmt + + - name: Cache Cargo + uses: Swatinem/rust-cache@v2 + + - name: Check formatting + run: cargo +nightly fmt --all -- --check + + lint-python: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v6 + + - name: Install Python + uses: actions/setup-python@v5 + with: + python-version: "3.12" + + - uses: astral-sh/setup-uv@v6 + with: + enable-cache: true + + - name: Install dependencies + run: uv sync --dev --no-install-package datafusion + + - name: Run Ruff + run: | + uv run --no-project ruff check --output-format=github python/ + uv run --no-project ruff format --check python/ + + - name: Run codespell + run: | + uv run --no-project codespell --toml pyproject.toml + + lint-toml: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v6 + + - name: Install taplo + uses: taiki-e/install-action@v2 + with: + tool: taplo-cli + + # if you encounter an error, try running 'taplo format' to fix the formatting automatically. 
+ - name: Check Cargo.toml formatting + run: taplo format --check + + check-crates-patch: + if: inputs.build_mode == 'release' + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v6 + + - name: Ensure [patch.crates-io] is empty + run: python3 dev/check_crates_patch.py + generate-license: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 - - uses: actions-rs/toolchain@v1 + - uses: actions/checkout@v6 + + - uses: astral-sh/setup-uv@v6 + with: + enable-cache: true + + - name: Install cargo-license + uses: taiki-e/install-action@v2 with: - profile: minimal - toolchain: stable - override: true + tool: cargo-license + - name: Generate license file - run: python ./dev/create_license.py - - uses: actions/upload-artifact@v3 + run: uv run --no-project python ./dev/create_license.py + + - uses: actions/upload-artifact@v6 with: name: python-wheel-license path: LICENSE.txt + # ============================================ + # Build - Linux x86_64 + # ============================================ + build-manylinux-x86_64: + needs: [generate-license, lint-rust, lint-python] + name: ManyLinux x86_64 + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v6 + + - run: rm LICENSE.txt + - name: Download LICENSE.txt + uses: actions/download-artifact@v7 + with: + name: python-wheel-license + path: . 
+ + - name: Setup Rust + uses: dtolnay/rust-toolchain@stable + + - name: Cache Cargo + uses: Swatinem/rust-cache@v2 + with: + key: ${{ inputs.build_mode }} + + - uses: astral-sh/setup-uv@v6 + with: + enable-cache: true + + - name: Build (release mode) + uses: PyO3/maturin-action@v1 + if: inputs.build_mode == 'release' + with: + target: x86_64-unknown-linux-gnu + manylinux: "2_28" + args: --release --strip --features protoc,substrait --out dist + rustup-components: rust-std + + - name: Build (debug mode) + uses: PyO3/maturin-action@v1 + if: inputs.build_mode == 'debug' + with: + target: x86_64-unknown-linux-gnu + manylinux: "2_28" + args: --features protoc,substrait --out dist + rustup-components: rust-std + + - name: Build FFI test library + uses: PyO3/maturin-action@v1 + with: + target: x86_64-unknown-linux-gnu + manylinux: "2_28" + working-directory: examples/datafusion-ffi-example + args: --out dist + rustup-components: rust-std + + - name: Archive wheels + uses: actions/upload-artifact@v6 + with: + name: dist-manylinux-x86_64 + path: dist/* + + - name: Archive FFI test wheel + uses: actions/upload-artifact@v6 + with: + name: test-ffi-manylinux-x86_64 + path: examples/datafusion-ffi-example/dist/* + + # ============================================ + # Build - Linux ARM64 + # ============================================ + build-manylinux-aarch64: + needs: [generate-license, lint-rust, lint-python] + name: ManyLinux arm64 + runs-on: ubuntu-24.04-arm + steps: + - uses: actions/checkout@v6 + + - run: rm LICENSE.txt + - name: Download LICENSE.txt + uses: actions/download-artifact@v7 + with: + name: python-wheel-license + path: . 
+ + - name: Setup Rust + uses: dtolnay/rust-toolchain@stable + + - name: Cache Cargo + uses: Swatinem/rust-cache@v2 + with: + key: ${{ inputs.build_mode }} + + - uses: astral-sh/setup-uv@v6 + with: + enable-cache: true + + - name: Build (release mode) + uses: PyO3/maturin-action@v1 + if: inputs.build_mode == 'release' + with: + target: aarch64-unknown-linux-gnu + manylinux: "2_28" + args: --release --strip --features protoc,substrait --out dist + rustup-components: rust-std + + - name: Build (debug mode) + uses: PyO3/maturin-action@v1 + if: inputs.build_mode == 'debug' + with: + target: aarch64-unknown-linux-gnu + manylinux: "2_28" + args: --features protoc,substrait --out dist + rustup-components: rust-std + + - name: Archive wheels + uses: actions/upload-artifact@v6 + if: inputs.build_mode == 'release' + with: + name: dist-manylinux-aarch64 + path: dist/* + + # ============================================ + # Build - macOS arm64 / Windows + # ============================================ build-python-mac-win: - needs: [generate-license] - name: Mac/Win + needs: [generate-license, lint-rust, lint-python] + name: macOS arm64 & Windows runs-on: ${{ matrix.os }} strategy: fail-fast: false @@ -50,82 +264,136 @@ jobs: python-version: ["3.10"] os: [macos-latest, windows-latest] steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v6 - - uses: actions/setup-python@v4 - with: - python-version: ${{ matrix.python-version }} - - - uses: actions-rs/toolchain@v1 - with: - toolchain: stable + - uses: dtolnay/rust-toolchain@stable - run: rm LICENSE.txt - name: Download LICENSE.txt - uses: actions/download-artifact@v3 + uses: actions/download-artifact@v7 with: name: python-wheel-license path: . 
- - name: Build Python package - run: maturin build --release --strip + - name: Cache Cargo + uses: Swatinem/rust-cache@v2 + with: + key: ${{ inputs.build_mode }} + + - uses: astral-sh/setup-uv@v7 + with: + enable-cache: true + + - name: Install Protoc + uses: arduino/setup-protoc@v3 + with: + version: "27.4" + repo-token: ${{ secrets.GITHUB_TOKEN }} + + - name: Install dependencies + run: uv sync --dev --no-install-package datafusion + + # Run clippy BEFORE maturin so we can avoid rebuilding. The features must match + # exactly the features used by maturin. Linux maturin builds need to happen in a + # container so only run this for our mac runner. + - name: Run Clippy + if: matrix.os != 'windows-latest' + run: cargo clippy --no-deps --all-targets --features substrait -- -D warnings + + - name: Build Python package (release mode) + if: inputs.build_mode == 'release' + run: uv run --no-project maturin build --release --strip --features substrait + + - name: Build Python package (debug mode) + if: inputs.build_mode != 'release' + run: uv run --no-project maturin build --features substrait - name: List Windows wheels if: matrix.os == 'windows-latest' run: dir target\wheels\ + # since the runner is dynamic shellcheck (from actionlint) can't infer this is powershell + # so we specify it explicitly + shell: powershell - name: List Mac wheels if: matrix.os != 'windows-latest' run: find target/wheels/ - name: Archive wheels - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v6 + if: inputs.build_mode == 'release' with: - name: dist + name: dist-${{ matrix.os }} path: target/wheels/* - build-manylinux: - needs: [generate-license] - name: Manylinux - runs-on: ubuntu-latest + # ============================================ + # Build - macOS x86_64 (release only) + # ============================================ + build-macos-x86_64: + if: inputs.build_mode == 'release' + needs: [generate-license, lint-rust, lint-python] + runs-on: macos-15-intel + strategy: + 
fail-fast: false + matrix: + python-version: ["3.10"] steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v6 + + - uses: dtolnay/rust-toolchain@stable + - run: rm LICENSE.txt - name: Download LICENSE.txt - uses: actions/download-artifact@v3 + uses: actions/download-artifact@v7 with: name: python-wheel-license path: . - - run: cat LICENSE.txt + + - name: Cache Cargo + uses: Swatinem/rust-cache@v2 + with: + key: ${{ inputs.build_mode }} + + - uses: astral-sh/setup-uv@v7 + with: + enable-cache: true + - name: Install Protoc - uses: arduino/setup-protoc@v1 + uses: arduino/setup-protoc@v3 with: - version: '3.x' + version: "27.4" repo-token: ${{ secrets.GITHUB_TOKEN }} - - name: Build wheels - uses: PyO3/maturin-action@v1 - with: - env: - RUST_BACKTRACE: 1 - rust-toolchain: nightly - target: x86_64 - manylinux: auto - args: --release --manylinux 2014 + + - name: Install dependencies + run: uv sync --dev --no-install-package datafusion + + - name: Build (release mode) + run: | + uv run --no-project maturin build --release --strip --features substrait + + - name: List Mac wheels + run: find target/wheels/ + - name: Archive wheels - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v6 with: - name: dist + name: dist-macos-aarch64 path: target/wheels/* + # ============================================ + # Build - Source Distribution + # ============================================ + build-sdist: needs: [generate-license] name: Source distribution + if: inputs.build_mode == 'release' runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v6 - run: rm LICENSE.txt - name: Download LICENSE.txt - uses: actions/download-artifact@v3 + uses: actions/download-artifact@v7 with: name: python-wheel-license path: . 
@@ -135,22 +403,132 @@ jobs: with: rust-toolchain: stable manylinux: auto - args: --release --sdist --out dist - - name: Archive wheels - uses: actions/upload-artifact@v3 + rustup-components: rust-std rustfmt + args: --release --sdist --out dist --features protoc,substrait + - name: Assert sdist build does not generate wheels + run: | + if [ "$(ls -A target/wheels)" ]; then + echo "Error: Sdist build generated wheels" + exit 1 + else + echo "Directory is clean" + fi + shell: bash + + # ============================================ + # Build - Source Distribution + # ============================================ + + merge-build-artifacts: + runs-on: ubuntu-latest + name: Merge build artifacts + if: inputs.build_mode == 'release' + needs: + - build-python-mac-win + - build-macos-x86_64 + - build-manylinux-x86_64 + - build-manylinux-aarch64 + - build-sdist + steps: + - name: Merge Build Artifacts + uses: actions/upload-artifact/merge@v6 with: name: dist - path: target/wheels/* + pattern: dist-* + + # ============================================ + # Build - Documentation + # ============================================ + # Documentation build job that runs after wheels are built + build-docs: + name: Build docs + runs-on: ubuntu-latest + needs: [build-manylinux-x86_64] # Only need the Linux wheel for docs + # Only run docs on main branch pushes, tags, or PRs + if: github.event_name == 'push' || github.event_name == 'pull_request' + steps: + - name: Set target branch + if: github.event_name == 'push' && (github.ref == 'refs/heads/main' || github.ref_type == 'tag') + id: target-branch + run: | + set -x + if test '${{ github.ref }}' = 'refs/heads/main'; then + echo "value=asf-staging" >> "$GITHUB_OUTPUT" + elif test '${{ github.ref_type }}' = 'tag'; then + echo "value=asf-site" >> "$GITHUB_OUTPUT" + else + echo "Unsupported input: ${{ github.ref }} / ${{ github.ref_type }}" + exit 1 + fi + + - name: Checkout docs sources + uses: actions/checkout@v6 + + - name: Checkout docs 
target branch + if: github.event_name == 'push' && (github.ref == 'refs/heads/main' || github.ref_type == 'tag') + uses: actions/checkout@v6 + with: + fetch-depth: 0 + ref: ${{ steps.target-branch.outputs.value }} + path: docs-target + + - name: Setup Python + uses: actions/setup-python@v6 + with: + python-version: "3.10" + + - name: Install dependencies + uses: astral-sh/setup-uv@v7 + with: + enable-cache: true + + # Download the Linux wheel built in the previous job + - name: Download pre-built Linux wheel + uses: actions/download-artifact@v7 + with: + name: dist-manylinux-x86_64 + path: wheels/ + + # Install from the pre-built wheels + - name: Install from pre-built wheels + run: | + set -x + uv venv + # Install documentation dependencies + uv sync --dev --no-install-package datafusion --group docs + # Install all pre-built wheels + WHEELS=$(find wheels/ -name "*.whl") + if [ -n "$WHEELS" ]; then + echo "Installing wheels:" + echo "$WHEELS" + uv pip install wheels/*.whl + else + echo "ERROR: No wheels found!" 
+ exit 1 + fi + + - name: Build docs + run: | + set -x + cd docs + curl -O https://gist.githubusercontent.com/ritchie46/cac6b337ea52281aa23c049250a4ff03/raw/89a957ff3919d90e6ef2d34235e6bf22304f3366/pokemon.csv + curl -O https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2021-01.parquet + uv run --no-project make html - # NOTE: PyPI publish needs to be done manually for now after release passed the vote - # release: - # name: Publish in PyPI - # needs: [build-manylinux, build-python-mac-win] - # runs-on: ubuntu-latest - # steps: - # - uses: actions/download-artifact@v3 - # - name: Publish to PyPI - # uses: pypa/gh-action-pypi-publish@master - # with: - # user: __token__ - # password: ${{ secrets.pypi_password }} + - name: Copy & push the generated HTML + if: github.event_name == 'push' && (github.ref == 'refs/heads/main' || github.ref_type == 'tag') + run: | + set -x + cd docs-target + # delete anything but: 1) '.'; 2) '..'; 3) .git/ + find ./ | grep -vE "^./$|^../$|^./.git" | xargs rm -rf + cp ../.asf.yaml . + cp -r ../docs/build/html/* . + git status --porcelain + if [ "$(git status --porcelain)" != "" ]; then + git config user.name "github-actions[bot]" + git config user.email "github-actions[bot]@users.noreply.github.com" + git add --all + git commit -m 'Publish built docs triggered by ${{ github.sha }}' + git push || git push --force + fi diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 000000000..ab284b522 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,41 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# CI workflow for pull requests - runs tests in DEBUG mode for faster feedback + +name: CI + +on: + pull_request: + branches: ["main"] + +concurrency: + group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }} + cancel-in-progress: true + +jobs: + build: + uses: ./.github/workflows/build.yml + with: + build_mode: debug + run_wheels: false + secrets: inherit + + test: + needs: build + uses: ./.github/workflows/test.yml + secrets: inherit diff --git a/.github/workflows/dev.yml b/.github/workflows/dev.yml index 05cf8ce68..2c8ecbc5e 100644 --- a/.github/workflows/dev.yml +++ b/.github/workflows/dev.yml @@ -25,10 +25,10 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v6 - name: Setup Python - uses: actions/setup-python@v4 + uses: actions/setup-python@v6 with: - python-version: "3.10" + python-version: "3.14" - name: Audit licenses run: ./dev/release/run-rat.sh . 
diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml deleted file mode 100644 index b1c9ffc12..000000000 --- a/.github/workflows/docs.yaml +++ /dev/null @@ -1,75 +0,0 @@ -on: - push: - branches: - - master - tags-ignore: - - "**-rc**" - -name: Deploy DataFusion Python site - -jobs: - build-docs: - name: Build docs - runs-on: ubuntu-latest - steps: - - name: Set target branch - id: target-branch - run: | - set -x - if test '${{ github.ref }}' = 'refs/heads/main'; then - echo "value=asf-staging" >> $GITHUB_OUTPUT - elif test '${{ github.ref_type }}' = 'tag'; then - echo "value=asf-site" >> $GITHUB_OUTPUT - else - echo "Unsupported input: ${{ github.ref }} / ${{ github.ref_type }}" - exit 1 - fi - - name: Checkout docs sources - uses: actions/checkout@v3 - - name: Checkout docs target branch - uses: actions/checkout@v3 - with: - fetch-depth: 0 - ref: ${{ steps.target-branch.outputs.value }} - path: docs-target - - name: Setup Python - uses: actions/setup-python@v4 - with: - python-version: "3.10" - - - name: Install dependencies - run: | - set -x - python3 -m venv venv - source venv/bin/activate - pip install -r requirements-310.txt - pip install -r docs/requirements.txt - - name: Build Datafusion - run: | - set -x - source venv/bin/activate - maturin develop - - - name: Build docs - run: | - set -x - source venv/bin/activate - cd docs - make html - - - name: Copy & push the generated HTML - run: | - set -x - cd docs-target - # delete anything but: 1) '.'; 2) '..'; 3) .git/ - find ./ | grep -vE "^./$|^../$|^./.git" | xargs rm -rf - cp ../.asf.yaml . - cp -r ../docs/build/html/* . 
- git status --porcelain - if [ "$(git status --porcelain)" != "" ]; then - git config user.name "github-actions[bot]" - git config user.email "github-actions[bot]@users.noreply.github.com" - git add --all - git commit -m 'Publish built docs triggered by ${{ github.sha }}' - git push || git push --force - fi diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml new file mode 100644 index 000000000..bddc89eac --- /dev/null +++ b/.github/workflows/release.yml @@ -0,0 +1,49 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +# Release workflow - runs tests in RELEASE mode and builds distribution wheels +# Triggered on: +# - Merges to main +# - Release candidate tags (*-rc*) +# - Release tags (e.g., 45.0.0) + +name: Release Build + +on: + push: + branches: + - "main" + tags: + - "*-rc*" # Release candidates (e.g., 45.0.0-rc1) + - "[0-9]+.*" # Release tags (e.g., 45.0.0) + +concurrency: + group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }} + cancel-in-progress: true + +jobs: + build: + uses: ./.github/workflows/build.yml + with: + build_mode: release + run_wheels: true + secrets: inherit + + test: + needs: build + uses: ./.github/workflows/test.yml + secrets: inherit diff --git a/.github/workflows/take.yml b/.github/workflows/take.yml new file mode 100644 index 000000000..86dc190ad --- /dev/null +++ b/.github/workflows/take.yml @@ -0,0 +1,41 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +name: Assign the issue via a `take` comment +on: + issue_comment: + types: created + +permissions: + issues: write + +jobs: + issue_assign: + runs-on: ubuntu-latest + if: (!github.event.issue.pull_request) && github.event.comment.body == 'take' + concurrency: + group: ${{ github.actor }}-issue-assign + steps: + - run: | + CODE=$(curl -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" -LI https://api.github.com/repos/${{ github.repository }}/issues/${{ github.event.issue.number }}/assignees/${{ github.event.comment.user.login }} -o /dev/null -w '%{http_code}\n' -s) + if [ "$CODE" -eq "204" ] + then + echo "Assigning issue ${{ github.event.issue.number }} to ${{ github.event.comment.user.login }}" + curl -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" -d '{"assignees": ["${{ github.event.comment.user.login }}"]}' https://api.github.com/repos/${{ github.repository }}/issues/${{ github.event.issue.number }}/assignees + else + echo "Cannot assign issue ${{ github.event.issue.number }} to ${{ github.event.comment.user.login }}" + fi \ No newline at end of file diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml deleted file mode 100644 index 164b09e15..000000000 --- a/.github/workflows/test.yaml +++ /dev/null @@ -1,115 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. 
See the License for the -# specific language governing permissions and limitations -# under the License. - -name: Python test -on: - push: - branches: [main] - pull_request: - branches: [main] - -concurrency: - group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }} - cancel-in-progress: true - -jobs: - test-matrix: - runs-on: ubuntu-latest - strategy: - fail-fast: false - matrix: - python-version: - - "3.10" - toolchain: - - "stable" - - "beta" - # we are not that much eager in walking on the edge yet - # - nightly - # build stable for only 3.7 - include: - - python-version: "3.7" - toolchain: "stable" - steps: - - uses: actions/checkout@v3 - - - name: Setup Rust Toolchain - uses: actions-rs/toolchain@v1 - id: rust-toolchain - with: - toolchain: ${{ matrix.toolchain }} - override: true - - - name: Install Protoc - uses: arduino/setup-protoc@v1 - with: - version: '3.x' - repo-token: ${{ secrets.GITHUB_TOKEN }} - - - name: Setup Python - uses: actions/setup-python@v4 - with: - python-version: ${{ matrix.python-version }} - - - name: Cache Cargo - uses: actions/cache@v3 - with: - path: ~/.cargo - key: cargo-cache-${{ steps.rust-toolchain.outputs.rustc_hash }}-${{ hashFiles('Cargo.lock') }} - - - name: Check Formatting - uses: actions-rs/cargo@v1 - if: ${{ matrix.python-version == '3.10' && matrix.toolchain == 'stable' }} - with: - command: fmt - args: -- --check - - - name: Run Clippy - uses: actions-rs/cargo@v1 - if: ${{ matrix.python-version == '3.10' && matrix.toolchain == 'stable' }} - with: - command: clippy - args: --all-targets --all-features -- -D clippy::all -A clippy::redundant_closure - - - name: Create Virtualenv (3.10) - if: ${{ matrix.python-version == '3.10' }} - run: | - python -m venv venv - source venv/bin/activate - pip install -r requirements-310.txt - - - name: Create Virtualenv (3.7) - if: ${{ matrix.python-version == '3.7' }} - run: | - python -m venv venv - source venv/bin/activate - pip install -r 
requirements-37.txt - - - name: Run Python Linters - if: ${{ matrix.python-version == '3.10' && matrix.toolchain == 'stable' }} - run: | - source venv/bin/activate - flake8 --exclude venv --ignore=E501,W503 - black --line-length 79 --diff --check . - - - name: Run tests - env: - RUST_BACKTRACE: 1 - run: | - git submodule update --init - source venv/bin/activate - pip install -e . -vv - pytest -v . diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 000000000..692563019 --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,133 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +# Reusable workflow for running tests +# This ensures the same tests run for both debug (PRs) and release (main/tags) builds + +name: Test + +on: + workflow_call: + +jobs: + test-matrix: + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + python-version: + - "3.10" + - "3.11" + - "3.12" + - "3.13" + - "3.14" + toolchain: + - "stable" + + steps: + - uses: actions/checkout@v6 + + - name: Verify example datafusion version + run: | + MAIN_VERSION=$(grep -A 1 "name = \"datafusion-common\"" Cargo.lock | grep "version = " | head -1 | sed 's/.*version = "\(.*\)"/\1/') + EXAMPLE_VERSION=$(grep -A 1 "name = \"datafusion-common\"" examples/datafusion-ffi-example/Cargo.lock | grep "version = " | head -1 | sed 's/.*version = "\(.*\)"/\1/') + echo "Main crate datafusion version: $MAIN_VERSION" + echo "FFI example datafusion version: $EXAMPLE_VERSION" + + if [ "$MAIN_VERSION" != "$EXAMPLE_VERSION" ]; then + echo "❌ Error: FFI example datafusion versions don't match!" + exit 1 + fi + + - name: Setup Python + uses: actions/setup-python@v6 + with: + python-version: ${{ matrix.python-version }} + + - name: Cache Cargo + uses: actions/cache@v5 + with: + path: ~/.cargo + key: cargo-cache-${{ matrix.toolchain }}-${{ hashFiles('Cargo.lock') }} + + - name: Install dependencies + uses: astral-sh/setup-uv@v7 + with: + enable-cache: true + + # Download the Linux wheel built in the build workflow + - name: Download pre-built Linux wheel + uses: actions/download-artifact@v7 + with: + name: dist-manylinux-x86_64 + path: wheels/ + + # Download the FFI test wheel + - name: Download pre-built FFI test wheel + uses: actions/download-artifact@v7 + with: + name: test-ffi-manylinux-x86_64 + path: wheels/ + + # Install from the pre-built wheels + - name: Install from pre-built wheels + run: | + set -x + uv venv + # Install development dependencies + uv sync --dev --no-install-package datafusion + # Install all pre-built wheels + WHEELS=$(find wheels/ -name "*.whl") + if [ -n 
"$WHEELS" ]; then + echo "Installing wheels:" + echo "$WHEELS" + uv pip install wheels/*.whl + else + echo "ERROR: No wheels found!" + exit 1 + fi + + - name: Run tests + env: + RUST_BACKTRACE: 1 + run: | + git submodule update --init + uv run --no-project pytest -v --import-mode=importlib + + - name: FFI unit tests + run: | + cd examples/datafusion-ffi-example + uv run --no-project pytest python/tests/_test*.py + + - name: Cache the generated dataset + id: cache-tpch-dataset + uses: actions/cache@v5 + with: + path: benchmarks/tpch/data + key: tpch-data-2.18.0 + + - name: Run dbgen to create 1 Gb dataset + if: ${{ steps.cache-tpch-dataset.outputs.cache-hit != 'true' }} + run: | + cd benchmarks/tpch + RUN_IN_CI=TRUE ./tpch-gen.sh 1 + + - name: Run TPC-H examples + run: | + cd examples/tpch + uv run --no-project python convert_data_to_parquet.py + uv run --no-project pytest _tests.py diff --git a/.github/workflows/verify-release-candidate.yml b/.github/workflows/verify-release-candidate.yml new file mode 100644 index 000000000..a10a4faa9 --- /dev/null +++ b/.github/workflows/verify-release-candidate.yml @@ -0,0 +1,78 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +name: Verify Release Candidate + +# NOTE: This workflow is intended to be run manually via workflow_dispatch. + +on: + workflow_dispatch: + inputs: + version: + description: Version number (e.g., 52.0.0) + required: true + type: string + rc_number: + description: Release candidate number (e.g., 0) + required: true + type: string + +concurrency: + group: ${{ github.repository }}-${{ github.ref }}-${{ github.workflow }} + cancel-in-progress: true + +jobs: + verify: + name: Verify RC (${{ matrix.os }}-${{ matrix.arch }}) + strategy: + fail-fast: false + matrix: + include: + # Linux + - os: linux + arch: x64 + runner: ubuntu-latest + - os: linux + arch: arm64 + runner: ubuntu-24.04-arm + + # macOS + - os: macos + arch: arm64 + runner: macos-latest + - os: macos + arch: x64 + runner: macos-15-intel + + # Windows + - os: windows + arch: x64 + runner: windows-latest + runs-on: ${{ matrix.runner }} + steps: + - name: Checkout repository + uses: actions/checkout@v6 + + - name: Set up protoc + uses: arduino/setup-protoc@v3 + with: + version: "27.4" + repo-token: ${{ secrets.GITHUB_TOKEN }} + + - name: Run release candidate verification + shell: bash + run: ./dev/release/verify-release-candidate.sh "${{ inputs.version }}" "${{ inputs.rc_number }}" diff --git a/.gitignore b/.gitignore index 4e4450082..614d82327 100644 --- a/.gitignore +++ b/.gitignore @@ -4,22 +4,35 @@ target /docs/temp /docs/build .DS_Store +.vscode # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class +# Python dist ignore +dist + # C extensions *.so +# Python dist +dist + # pyenv # For a library or package, you might want to ignore these files since the code is # intended to run in multiple environments; otherwise, check them in: .python-version venv +.venv apache-rat-*.jar *rat.txt .env -CHANGELOG.md.bak \ No newline at end of file +CHANGELOG.md.bak + +docs/mdbook/book + +.pyo3_build_config + diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 3c6805322..8ae6a4e32 
100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -16,37 +16,42 @@ # under the License. repos: - - repo: https://github.com/psf/black - rev: 22.3.0 + - repo: https://github.com/rhysd/actionlint + rev: v1.7.6 hooks: - - id: black - files: datafusion/.* - # Explicitly specify the pyproject.toml at the repo root, not per-project. - args: ["--config", "pyproject.toml", "--line-length", "79", "--diff", "--check", "."] - - repo: https://github.com/PyCQA/flake8 - rev: 5.0.4 + - id: actionlint-docker + - repo: https://github.com/astral-sh/ruff-pre-commit + # Ruff version. + rev: v0.15.1 hooks: - - id: flake8 - files: datafusion/.*$ - types: [file] - types_or: [python] - additional_dependencies: ["flake8-force"] + # Run the linter. + - id: ruff + # Run the formatter. + - id: ruff-format - repo: local hooks: - id: rust-fmt name: Rust fmt description: Run cargo fmt on files included in the commit. rustfmt should be installed before-hand. - entry: cargo fmt --all -- + entry: cargo +nightly fmt --all -- pass_filenames: true types: [file, rust] language: system - id: rust-clippy name: Rust clippy description: Run cargo clippy on files included in the commit. clippy should be installed before-hand. - entry: cargo clippy --all-targets --all-features -- -Dclippy::all -Aclippy::redundant_closure + entry: cargo clippy --all-targets --all-features -- -Dclippy::all -D warnings -Aclippy::redundant_closure pass_filenames: false types: [file, rust] language: system + - repo: https://github.com/codespell-project/codespell + rev: v2.4.1 + hooks: + - id: codespell + args: [ --toml, "pyproject.toml"] + additional_dependencies: + - tomli + default_language_version: python: python3 diff --git a/CHANGELOG.md b/CHANGELOG.md index 0cf05413a..ae40911d8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,279 +17,6 @@ under the License. 
--> -# Changelog +# DataFusion Python Changelog -## [20.0.0](https://github.com/apache/arrow-datafusion-python/tree/20.0.0) (2023-03-17) - -[Full Changelog](https://github.com/apache/arrow-datafusion-python/compare/0.8.0...20.0.0) - -**Implemented enhancements:** - -- Empty relation bindings [#208](https://github.com/apache/arrow-datafusion-python/pull/208) (jdye64) -- wrap display_name and canonical_name functions [#214](https://github.com/apache/arrow-datafusion-python/pull/214) (jdye64) -- Add PyAlias bindings [#216](https://github.com/apache/arrow-datafusion-python/pull/216) (jdye64) -- Add bindings for scalar_variable [#218](https://github.com/apache/arrow-datafusion-python/pull/218) (jdye64) -- Bindings for LIKE type expressions [#220](https://github.com/apache/arrow-datafusion-python/pull/220) (jdye64) -- Bool expr bindings [#223](https://github.com/apache/arrow-datafusion-python/pull/223) (jdye64) -- Between bindings [#229](https://github.com/apache/arrow-datafusion-python/pull/229) (jdye64) -- Add bindings for GetIndexedField [#227](https://github.com/apache/arrow-datafusion-python/pull/227) (jdye64) -- Add bindings for case, cast, and trycast [#232](https://github.com/apache/arrow-datafusion-python/pull/232) (jdye64) -- add remaining expr bindings [#233](https://github.com/apache/arrow-datafusion-python/pull/233) (jdye64) -- feature: Additional export methods [#236](https://github.com/apache/arrow-datafusion-python/pull/236) (simicd) -- Add Python wrapper for LogicalPlan::Union [#240](https://github.com/apache/arrow-datafusion-python/pull/240) (iajoiner) -- feature: Create dataframe from pandas, polars, dictionary, list or pyarrow Table [#242](https://github.com/apache/arrow-datafusion-python/pull/242) (simicd) -- Add Python wrappers for `LogicalPlan::Join` and `LogicalPlan::CrossJoin` [#246](https://github.com/apache/arrow-datafusion-python/pull/246) (iajoiner) -- feature: Set table name from ctx functions 
[#260](https://github.com/apache/arrow-datafusion-python/pull/260) (simicd) -- Explain bindings [#264](https://github.com/apache/arrow-datafusion-python/pull/264) (jdye64) -- Extension bindings [#266](https://github.com/apache/arrow-datafusion-python/pull/266) (jdye64) -- Subquery alias bindings [#269](https://github.com/apache/arrow-datafusion-python/pull/269) (jdye64) -- Create memory table [#271](https://github.com/apache/arrow-datafusion-python/pull/271) (jdye64) -- Create view bindings [#273](https://github.com/apache/arrow-datafusion-python/pull/273) (jdye64) -- Re-export Datafusion dependencies [#277](https://github.com/apache/arrow-datafusion-python/pull/277) (jdye64) -- Distinct bindings [#275](https://github.com/apache/arrow-datafusion-python/pull/275) (jdye64) -- Drop table bindings [#283](https://github.com/apache/arrow-datafusion-python/pull/283) (jdye64) -- Bindings for LogicalPlan::Repartition [#285](https://github.com/apache/arrow-datafusion-python/pull/285) (jdye64) -- Expand Rust return type support for Arrow DataTypes in ScalarValue [#287](https://github.com/apache/arrow-datafusion-python/pull/287) (jdye64) - -**Documentation updates:** - -- docs: Example of calling Python UDF & UDAF in SQL [#258](https://github.com/apache/arrow-datafusion-python/pull/258) (simicd) - -**Merged pull requests:** - -- Minor docs updates [#210](https://github.com/apache/arrow-datafusion-python/pull/210) (andygrove) -- Empty relation bindings [#208](https://github.com/apache/arrow-datafusion-python/pull/208) (jdye64) -- wrap display_name and canonical_name functions [#214](https://github.com/apache/arrow-datafusion-python/pull/214) (jdye64) -- Add PyAlias bindings [#216](https://github.com/apache/arrow-datafusion-python/pull/216) (jdye64) -- Add bindings for scalar_variable [#218](https://github.com/apache/arrow-datafusion-python/pull/218) (jdye64) -- Bindings for LIKE type expressions [#220](https://github.com/apache/arrow-datafusion-python/pull/220) (jdye64) -- Bool 
expr bindings [#223](https://github.com/apache/arrow-datafusion-python/pull/223) (jdye64) -- Between bindings [#229](https://github.com/apache/arrow-datafusion-python/pull/229) (jdye64) -- Add bindings for GetIndexedField [#227](https://github.com/apache/arrow-datafusion-python/pull/227) (jdye64) -- Add bindings for case, cast, and trycast [#232](https://github.com/apache/arrow-datafusion-python/pull/232) (jdye64) -- add remaining expr bindings [#233](https://github.com/apache/arrow-datafusion-python/pull/233) (jdye64) -- Pre-commit hooks [#228](https://github.com/apache/arrow-datafusion-python/pull/228) (jdye64) -- Implement new release process [#149](https://github.com/apache/arrow-datafusion-python/pull/149) (andygrove) -- feature: Additional export methods [#236](https://github.com/apache/arrow-datafusion-python/pull/236) (simicd) -- Add Python wrapper for LogicalPlan::Union [#240](https://github.com/apache/arrow-datafusion-python/pull/240) (iajoiner) -- feature: Create dataframe from pandas, polars, dictionary, list or pyarrow Table [#242](https://github.com/apache/arrow-datafusion-python/pull/242) (simicd) -- Fix release instructions [#238](https://github.com/apache/arrow-datafusion-python/pull/238) (andygrove) -- Add Python wrappers for `LogicalPlan::Join` and `LogicalPlan::CrossJoin` [#246](https://github.com/apache/arrow-datafusion-python/pull/246) (iajoiner) -- docs: Example of calling Python UDF & UDAF in SQL [#258](https://github.com/apache/arrow-datafusion-python/pull/258) (simicd) -- feature: Set table name from ctx functions [#260](https://github.com/apache/arrow-datafusion-python/pull/260) (simicd) -- Upgrade to DataFusion 19 [#262](https://github.com/apache/arrow-datafusion-python/pull/262) (andygrove) -- Explain bindings [#264](https://github.com/apache/arrow-datafusion-python/pull/264) (jdye64) -- Extension bindings [#266](https://github.com/apache/arrow-datafusion-python/pull/266) (jdye64) -- Subquery alias bindings 
[#269](https://github.com/apache/arrow-datafusion-python/pull/269) (jdye64) -- Create memory table [#271](https://github.com/apache/arrow-datafusion-python/pull/271) (jdye64) -- Create view bindings [#273](https://github.com/apache/arrow-datafusion-python/pull/273) (jdye64) -- Re-export Datafusion dependencies [#277](https://github.com/apache/arrow-datafusion-python/pull/277) (jdye64) -- Distinct bindings [#275](https://github.com/apache/arrow-datafusion-python/pull/275) (jdye64) -- build(deps): bump actions/checkout from 2 to 3 [#244](https://github.com/apache/arrow-datafusion-python/pull/244) (dependabot[bot]) -- build(deps): bump actions/upload-artifact from 2 to 3 [#245](https://github.com/apache/arrow-datafusion-python/pull/245) (dependabot[bot]) -- build(deps): bump actions/download-artifact from 2 to 3 [#243](https://github.com/apache/arrow-datafusion-python/pull/243) (dependabot[bot]) -- Use DataFusion 20 [#278](https://github.com/apache/arrow-datafusion-python/pull/278) (andygrove) -- Drop table bindings [#283](https://github.com/apache/arrow-datafusion-python/pull/283) (jdye64) -- Bindings for LogicalPlan::Repartition [#285](https://github.com/apache/arrow-datafusion-python/pull/285) (jdye64) -- Expand Rust return type support for Arrow DataTypes in ScalarValue [#287](https://github.com/apache/arrow-datafusion-python/pull/287) (jdye64) - -## [0.8.0](https://github.com/apache/arrow-datafusion-python/tree/0.8.0) (2023-02-22) - -[Full Changelog](https://github.com/apache/arrow-datafusion-python/compare/0.8.0-rc1...0.8.0) - -**Implemented enhancements:** - -- Add support for cuDF physical execution engine [\#202](https://github.com/apache/arrow-datafusion-python/issues/202) -- Make it easier to create a Pandas dataframe from DataFusion query results [\#139](https://github.com/apache/arrow-datafusion-python/issues/139) - -**Fixed bugs:** - -- Build error: could not compile `thiserror` due to 2 previous errors 
[\#69](https://github.com/apache/arrow-datafusion-python/issues/69) - -**Closed issues:** - -- Integrate with the new `object_store` crate [\#22](https://github.com/apache/arrow-datafusion-python/issues/22) - -**Merged pull requests:** - -- Update README in preparation for 0.8 release [\#206](https://github.com/apache/arrow-datafusion-python/pull/206) ([andygrove](https://github.com/andygrove)) -- Add support for cudf as a physical execution engine [\#205](https://github.com/apache/arrow-datafusion-python/pull/205) ([jdye64](https://github.com/jdye64)) -- Run `maturin develop` instead of `cargo build` in verification script [\#200](https://github.com/apache/arrow-datafusion-python/pull/200) ([andygrove](https://github.com/andygrove)) -- Add tests for recently added functionality [\#199](https://github.com/apache/arrow-datafusion-python/pull/199) ([andygrove](https://github.com/andygrove)) -- Implement `to_pandas()` [\#197](https://github.com/apache/arrow-datafusion-python/pull/197) ([simicd](https://github.com/simicd)) -- Add Python wrapper for LogicalPlan::Sort [\#196](https://github.com/apache/arrow-datafusion-python/pull/196) ([andygrove](https://github.com/andygrove)) -- Add Python wrapper for LogicalPlan::Aggregate [\#195](https://github.com/apache/arrow-datafusion-python/pull/195) ([andygrove](https://github.com/andygrove)) -- Add Python wrapper for LogicalPlan::Limit [\#193](https://github.com/apache/arrow-datafusion-python/pull/193) ([andygrove](https://github.com/andygrove)) -- Add Python wrapper for LogicalPlan::Filter [\#192](https://github.com/apache/arrow-datafusion-python/pull/192) ([andygrove](https://github.com/andygrove)) -- Add experimental support for executing SQL with Polars and Pandas [\#190](https://github.com/apache/arrow-datafusion-python/pull/190) ([andygrove](https://github.com/andygrove)) -- Update changelog for 0.8 release [\#188](https://github.com/apache/arrow-datafusion-python/pull/188) ([andygrove](https://github.com/andygrove)) -- 
Add ability to execute ExecutionPlan and get a stream of RecordBatch [\#186](https://github.com/apache/arrow-datafusion-python/pull/186) ([andygrove](https://github.com/andygrove)) -- Dffield bindings [\#185](https://github.com/apache/arrow-datafusion-python/pull/185) ([jdye64](https://github.com/jdye64)) -- Add bindings for DFSchema [\#183](https://github.com/apache/arrow-datafusion-python/pull/183) ([jdye64](https://github.com/jdye64)) -- test: Window functions [\#182](https://github.com/apache/arrow-datafusion-python/pull/182) ([simicd](https://github.com/simicd)) -- Add bindings for Projection [\#180](https://github.com/apache/arrow-datafusion-python/pull/180) ([jdye64](https://github.com/jdye64)) -- Table scan bindings [\#178](https://github.com/apache/arrow-datafusion-python/pull/178) ([jdye64](https://github.com/jdye64)) -- Make session configurable [\#176](https://github.com/apache/arrow-datafusion-python/pull/176) ([andygrove](https://github.com/andygrove)) -- Upgrade to DataFusion 18.0.0 [\#175](https://github.com/apache/arrow-datafusion-python/pull/175) ([andygrove](https://github.com/andygrove)) -- Use latest DataFusion rev in preparation for DF 18 release [\#174](https://github.com/apache/arrow-datafusion-python/pull/174) ([andygrove](https://github.com/andygrove)) -- Arrow type bindings [\#173](https://github.com/apache/arrow-datafusion-python/pull/173) ([jdye64](https://github.com/jdye64)) -- Pyo3 bump [\#171](https://github.com/apache/arrow-datafusion-python/pull/171) ([jdye64](https://github.com/jdye64)) -- feature: Add additional aggregation functions [\#170](https://github.com/apache/arrow-datafusion-python/pull/170) ([simicd](https://github.com/simicd)) -- Make from\_substrait\_plan return DataFrame instead of LogicalPlan [\#164](https://github.com/apache/arrow-datafusion-python/pull/164) ([andygrove](https://github.com/andygrove)) -- feature: Implement count method [\#163](https://github.com/apache/arrow-datafusion-python/pull/163) 
([simicd](https://github.com/simicd)) -- CI Fixes [\#162](https://github.com/apache/arrow-datafusion-python/pull/162) ([jdye64](https://github.com/jdye64)) -- Upgrade to DataFusion 17 [\#160](https://github.com/apache/arrow-datafusion-python/pull/160) ([andygrove](https://github.com/andygrove)) -- feature: Improve string representation of datafusion classes [\#159](https://github.com/apache/arrow-datafusion-python/pull/159) ([simicd](https://github.com/simicd)) -- Make PyExecutionPlan.plan public [\#156](https://github.com/apache/arrow-datafusion-python/pull/156) ([andygrove](https://github.com/andygrove)) -- Expose methods on logical and execution plans [\#155](https://github.com/apache/arrow-datafusion-python/pull/155) ([andygrove](https://github.com/andygrove)) -- Fix clippy for new Rust version [\#154](https://github.com/apache/arrow-datafusion-python/pull/154) ([andygrove](https://github.com/andygrove)) -- Add DataFrame methods for accessing plans [\#153](https://github.com/apache/arrow-datafusion-python/pull/153) ([andygrove](https://github.com/andygrove)) -- Use DataFusion rev 5238e8c97f998b4d2cb9fab85fb182f325a1a7fb [\#150](https://github.com/apache/arrow-datafusion-python/pull/150) ([andygrove](https://github.com/andygrove)) -- build\(deps\): bump async-trait from 0.1.61 to 0.1.62 [\#148](https://github.com/apache/arrow-datafusion-python/pull/148) ([dependabot[bot]](https://github.com/apps/dependabot)) -- Rename default branch from master to main [\#147](https://github.com/apache/arrow-datafusion-python/pull/147) ([andygrove](https://github.com/andygrove)) -- Substrait bindings [\#145](https://github.com/apache/arrow-datafusion-python/pull/145) ([jdye64](https://github.com/jdye64)) -- build\(deps\): bump uuid from 0.8.2 to 1.2.2 [\#143](https://github.com/apache/arrow-datafusion-python/pull/143) ([dependabot[bot]](https://github.com/apps/dependabot)) -- Prepare for 0.8.0 release [\#141](https://github.com/apache/arrow-datafusion-python/pull/141) 
([andygrove](https://github.com/andygrove)) -- Improve README and add more examples [\#137](https://github.com/apache/arrow-datafusion-python/pull/137) ([andygrove](https://github.com/andygrove)) -- test: Expand tests for built-in functions [\#129](https://github.com/apache/arrow-datafusion-python/pull/129) ([simicd](https://github.com/simicd)) -- build\(deps\): bump object\_store from 0.5.2 to 0.5.3 [\#126](https://github.com/apache/arrow-datafusion-python/pull/126) ([dependabot[bot]](https://github.com/apps/dependabot)) -- build\(deps\): bump mimalloc from 0.1.32 to 0.1.34 [\#125](https://github.com/apache/arrow-datafusion-python/pull/125) ([dependabot[bot]](https://github.com/apps/dependabot)) -- Introduce conda directory containing datafusion-dev.yaml conda enviro… [\#124](https://github.com/apache/arrow-datafusion-python/pull/124) ([jdye64](https://github.com/jdye64)) -- build\(deps\): bump bzip2 from 0.4.3 to 0.4.4 [\#121](https://github.com/apache/arrow-datafusion-python/pull/121) ([dependabot[bot]](https://github.com/apps/dependabot)) -- build\(deps\): bump tokio from 1.23.0 to 1.24.1 [\#119](https://github.com/apache/arrow-datafusion-python/pull/119) ([dependabot[bot]](https://github.com/apps/dependabot)) -- build\(deps\): bump async-trait from 0.1.60 to 0.1.61 [\#118](https://github.com/apache/arrow-datafusion-python/pull/118) ([dependabot[bot]](https://github.com/apps/dependabot)) -- Upgrade to DataFusion 16.0.0 [\#115](https://github.com/apache/arrow-datafusion-python/pull/115) ([andygrove](https://github.com/andygrove)) -- Bump async-trait from 0.1.57 to 0.1.60 [\#114](https://github.com/apache/arrow-datafusion-python/pull/114) ([dependabot[bot]](https://github.com/apps/dependabot)) -- Bump object\_store from 0.5.1 to 0.5.2 [\#112](https://github.com/apache/arrow-datafusion-python/pull/112) ([dependabot[bot]](https://github.com/apps/dependabot)) -- Bump tokio from 1.21.2 to 1.23.0 [\#109](https://github.com/apache/arrow-datafusion-python/pull/109) 
([dependabot[bot]](https://github.com/apps/dependabot)) -- Add entries for publishing production \(asf-site\) and staging docs [\#107](https://github.com/apache/arrow-datafusion-python/pull/107) ([martin-g](https://github.com/martin-g)) -- Add a workflow that builds the docs and deploys them at staged or production [\#104](https://github.com/apache/arrow-datafusion-python/pull/104) ([martin-g](https://github.com/martin-g)) -- Upgrade to DataFusion 15.0.0 [\#103](https://github.com/apache/arrow-datafusion-python/pull/103) ([andygrove](https://github.com/andygrove)) -- build\(deps\): bump futures from 0.3.24 to 0.3.25 [\#102](https://github.com/apache/arrow-datafusion-python/pull/102) ([dependabot[bot]](https://github.com/apps/dependabot)) -- build\(deps\): bump pyo3 from 0.17.2 to 0.17.3 [\#101](https://github.com/apache/arrow-datafusion-python/pull/101) ([dependabot[bot]](https://github.com/apps/dependabot)) -- build\(deps\): bump mimalloc from 0.1.30 to 0.1.32 [\#98](https://github.com/apache/arrow-datafusion-python/pull/98) ([dependabot[bot]](https://github.com/apps/dependabot)) -- build\(deps\): bump rand from 0.7.3 to 0.8.5 [\#97](https://github.com/apache/arrow-datafusion-python/pull/97) ([dependabot[bot]](https://github.com/apps/dependabot)) -- Fix GitHub actions warnings [\#95](https://github.com/apache/arrow-datafusion-python/pull/95) ([martin-g](https://github.com/martin-g)) -- Fixes \#81 - Add CI workflow for source distribution [\#93](https://github.com/apache/arrow-datafusion-python/pull/93) ([martin-g](https://github.com/martin-g)) -- post-release updates [\#91](https://github.com/apache/arrow-datafusion-python/pull/91) ([andygrove](https://github.com/andygrove)) -- Build for manylinux 2014 [\#88](https://github.com/apache/arrow-datafusion-python/pull/88) ([martin-g](https://github.com/martin-g)) -- update release readme tag [\#86](https://github.com/apache/arrow-datafusion-python/pull/86) ([Jimexist](https://github.com/Jimexist)) -- Upgrade Maturin to 
0.14.2 [\#85](https://github.com/apache/arrow-datafusion-python/pull/85) ([martin-g](https://github.com/martin-g)) -- Update release instructions [\#83](https://github.com/apache/arrow-datafusion-python/pull/83) ([andygrove](https://github.com/andygrove)) -- \[Functions\] - Add python function binding to `functions` [\#73](https://github.com/apache/arrow-datafusion-python/pull/73) ([francis-du](https://github.com/francis-du)) - -## [0.8.0-rc1](https://github.com/apache/arrow-datafusion-python/tree/0.8.0-rc1) (2023-02-17) - -[Full Changelog](https://github.com/apache/arrow-datafusion-python/compare/0.7.0-rc2...0.8.0-rc1) - -**Implemented enhancements:** - -- Add bindings for datafusion\_common::DFField [\#184](https://github.com/apache/arrow-datafusion-python/issues/184) -- Add bindings for DFSchema/DFSchemaRef [\#181](https://github.com/apache/arrow-datafusion-python/issues/181) -- Add bindings for datafusion\_expr Projection [\#179](https://github.com/apache/arrow-datafusion-python/issues/179) -- Add bindings for `TableScan` struct from `datafusion_expr::TableScan` [\#177](https://github.com/apache/arrow-datafusion-python/issues/177) -- Add a "mapping" struct for types [\#172](https://github.com/apache/arrow-datafusion-python/issues/172) -- Improve string representation of datafusion classes \(dataframe, context, expression, ...\) [\#158](https://github.com/apache/arrow-datafusion-python/issues/158) -- Add DataFrame count method [\#151](https://github.com/apache/arrow-datafusion-python/issues/151) -- \[REQUEST\] Github Actions Improvements [\#146](https://github.com/apache/arrow-datafusion-python/issues/146) -- Change default branch name from master to main [\#144](https://github.com/apache/arrow-datafusion-python/issues/144) -- Bump pyo3 to 0.18.0 [\#140](https://github.com/apache/arrow-datafusion-python/issues/140) -- Add script for Python linting [\#134](https://github.com/apache/arrow-datafusion-python/issues/134) -- Add Python bindings for substrait module 
[\#132](https://github.com/apache/arrow-datafusion-python/issues/132) -- Expand unit tests for built-in functions [\#128](https://github.com/apache/arrow-datafusion-python/issues/128) -- support creating arrow-datafusion-python conda environment [\#122](https://github.com/apache/arrow-datafusion-python/issues/122) -- Build Python source distribution in GitHub workflow [\#81](https://github.com/apache/arrow-datafusion-python/issues/81) -- EPIC: Add all functions to python binding `functions` [\#72](https://github.com/apache/arrow-datafusion-python/issues/72) - -**Fixed bugs:** - -- Build is broken [\#161](https://github.com/apache/arrow-datafusion-python/issues/161) -- Out of memory when sorting [\#157](https://github.com/apache/arrow-datafusion-python/issues/157) -- window\_lead test appears to be non-deterministic [\#135](https://github.com/apache/arrow-datafusion-python/issues/135) -- Reading csv does not work [\#130](https://github.com/apache/arrow-datafusion-python/issues/130) -- Github actions produce a lot of warnings [\#94](https://github.com/apache/arrow-datafusion-python/issues/94) -- ASF source release tarball has wrong directory name [\#90](https://github.com/apache/arrow-datafusion-python/issues/90) -- Python Release Build failing after upgrading to maturin 14.2 [\#87](https://github.com/apache/arrow-datafusion-python/issues/87) -- Maturin build hangs on Linux ARM64 [\#84](https://github.com/apache/arrow-datafusion-python/issues/84) -- Cannot install on Mac M1 from source tarball from testpypi [\#82](https://github.com/apache/arrow-datafusion-python/issues/82) -- ImportPathMismatchError when running pytest locally [\#77](https://github.com/apache/arrow-datafusion-python/issues/77) - -**Closed issues:** - -- Publish documentation for Python bindings [\#39](https://github.com/apache/arrow-datafusion-python/issues/39) -- Add Python binding for `approx_median` [\#32](https://github.com/apache/arrow-datafusion-python/issues/32) -- Release version 0.7.0 
[\#7](https://github.com/apache/arrow-datafusion-python/issues/7) - -## [0.7.0-rc2](https://github.com/apache/arrow-datafusion-python/tree/0.7.0-rc2) (2022-11-26) - -[Full Changelog](https://github.com/apache/arrow-datafusion-python/compare/0.7.0...0.7.0-rc2) - - -## [Unreleased](https://github.com/datafusion-contrib/datafusion-python/tree/HEAD) - -[Full Changelog](https://github.com/datafusion-contrib/datafusion-python/compare/0.5.1...HEAD) - -**Merged pull requests:** - -- use \_\_getitem\_\_ for df column selection [\#41](https://github.com/datafusion-contrib/datafusion-python/pull/41) ([Jimexist](https://github.com/Jimexist)) -- fix demo in readme [\#40](https://github.com/datafusion-contrib/datafusion-python/pull/40) ([Jimexist](https://github.com/Jimexist)) -- Implement select_columns [\#39](https://github.com/datafusion-contrib/datafusion-python/pull/39) ([andygrove](https://github.com/andygrove)) -- update readme and changelog [\#38](https://github.com/datafusion-contrib/datafusion-python/pull/38) ([Jimexist](https://github.com/Jimexist)) -- Add PyDataFrame.explain [\#36](https://github.com/datafusion-contrib/datafusion-python/pull/36) ([andygrove](https://github.com/andygrove)) -- Release 0.5.0 [\#34](https://github.com/datafusion-contrib/datafusion-python/pull/34) ([Jimexist](https://github.com/Jimexist)) -- disable nightly in workflow [\#33](https://github.com/datafusion-contrib/datafusion-python/pull/33) ([Jimexist](https://github.com/Jimexist)) -- update requirements to 37 and 310, update readme [\#32](https://github.com/datafusion-contrib/datafusion-python/pull/32) ([Jimexist](https://github.com/Jimexist)) -- Add custom global allocator [\#30](https://github.com/datafusion-contrib/datafusion-python/pull/30) ([matthewmturner](https://github.com/matthewmturner)) -- Remove pandas dependency [\#25](https://github.com/datafusion-contrib/datafusion-python/pull/25) ([matthewmturner](https://github.com/matthewmturner)) -- upgrade datafusion and pyo3 
[\#20](https://github.com/datafusion-contrib/datafusion-python/pull/20) ([Jimexist](https://github.com/Jimexist)) -- update maturin 0.12+ [\#17](https://github.com/datafusion-contrib/datafusion-python/pull/17) ([Jimexist](https://github.com/Jimexist)) -- Update README.md [\#16](https://github.com/datafusion-contrib/datafusion-python/pull/16) ([Jimexist](https://github.com/Jimexist)) -- apply cargo clippy --fix [\#15](https://github.com/datafusion-contrib/datafusion-python/pull/15) ([Jimexist](https://github.com/Jimexist)) -- update test workflow to include rust clippy and check [\#14](https://github.com/datafusion-contrib/datafusion-python/pull/14) ([Jimexist](https://github.com/Jimexist)) -- use maturin 0.12.6 [\#13](https://github.com/datafusion-contrib/datafusion-python/pull/13) ([Jimexist](https://github.com/Jimexist)) -- apply cargo fmt [\#12](https://github.com/datafusion-contrib/datafusion-python/pull/12) ([Jimexist](https://github.com/Jimexist)) -- use stable not nightly [\#11](https://github.com/datafusion-contrib/datafusion-python/pull/11) ([Jimexist](https://github.com/Jimexist)) -- ci: test against more compilers, setup clippy and fix clippy lints [\#9](https://github.com/datafusion-contrib/datafusion-python/pull/9) ([cpcloud](https://github.com/cpcloud)) -- Fix use of importlib.metadata and unify requirements.txt [\#8](https://github.com/datafusion-contrib/datafusion-python/pull/8) ([cpcloud](https://github.com/cpcloud)) -- Ship the Cargo.lock file in the source distribution [\#7](https://github.com/datafusion-contrib/datafusion-python/pull/7) ([cpcloud](https://github.com/cpcloud)) -- add \_\_version\_\_ attribute to datafusion object [\#3](https://github.com/datafusion-contrib/datafusion-python/pull/3) ([tfeda](https://github.com/tfeda)) -- fix ci by fixing directories [\#2](https://github.com/datafusion-contrib/datafusion-python/pull/2) ([Jimexist](https://github.com/Jimexist)) -- setup workflow 
[\#1](https://github.com/datafusion-contrib/datafusion-python/pull/1) ([Jimexist](https://github.com/Jimexist)) - -## [0.5.1](https://github.com/datafusion-contrib/datafusion-python/tree/0.5.1) (2022-03-15) - -[Full Changelog](https://github.com/datafusion-contrib/datafusion-python/compare/0.5.1-rc1...0.5.1) - -## [0.5.1-rc1](https://github.com/datafusion-contrib/datafusion-python/tree/0.5.1-rc1) (2022-03-15) - -[Full Changelog](https://github.com/datafusion-contrib/datafusion-python/compare/0.5.0...0.5.1-rc1) - -## [0.5.0](https://github.com/datafusion-contrib/datafusion-python/tree/0.5.0) (2022-03-10) - -[Full Changelog](https://github.com/datafusion-contrib/datafusion-python/compare/0.5.0-rc2...0.5.0) - -## [0.5.0-rc2](https://github.com/datafusion-contrib/datafusion-python/tree/0.5.0-rc2) (2022-03-10) - -[Full Changelog](https://github.com/datafusion-contrib/datafusion-python/compare/0.5.0-rc1...0.5.0-rc2) - -**Closed issues:** - -- Add support for Ballista [\#37](https://github.com/datafusion-contrib/datafusion-python/issues/37) -- Implement DataFrame.explain [\#35](https://github.com/datafusion-contrib/datafusion-python/issues/35) - -## [0.5.0-rc1](https://github.com/datafusion-contrib/datafusion-python/tree/0.5.0-rc1) (2022-03-09) - -[Full Changelog](https://github.com/datafusion-contrib/datafusion-python/compare/4c98b8e9c3c3f8e2e6a8f2d1ffcfefda344c4680...0.5.0-rc1) - -**Closed issues:** - -- Investigate exposing additional optimizations [\#28](https://github.com/datafusion-contrib/datafusion-python/issues/28) -- Use custom allocator in Python build [\#27](https://github.com/datafusion-contrib/datafusion-python/issues/27) -- Why is pandas a requirement? 
[\#24](https://github.com/datafusion-contrib/datafusion-python/issues/24) -- Unable to build [\#18](https://github.com/datafusion-contrib/datafusion-python/issues/18) -- Setup CI against multiple Python version [\#6](https://github.com/datafusion-contrib/datafusion-python/issues/6) - -\* _This Changelog was automatically generated by [github_changelog_generator](https://github.com/github-changelog-generator/github-changelog-generator)_ - - -\* *This Changelog was automatically generated by [github_changelog_generator](https://github.com/github-changelog-generator/github-changelog-generator)* +The changelogs have now moved [here](./dev/changelog). diff --git a/Cargo.lock b/Cargo.lock index 23f1486b4..40b1ba7f1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1,37 +1,80 @@ # This file is automatically @generated by Cargo. # It is not intended for manual editing. -version = 3 +version = 4 [[package]] -name = "adler" -version = "1.0.2" +name = "abi_stable" +version = "0.11.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" +checksum = "69d6512d3eb05ffe5004c59c206de7f99c34951504056ce23fc953842f12c445" +dependencies = [ + "abi_stable_derive", + "abi_stable_shared", + "const_panic", + "core_extensions", + "crossbeam-channel", + "generational-arena", + "libloading", + "lock_api", + "parking_lot", + "paste", + "repr_offset", + "rustc_version", + "serde", + "serde_derive", + "serde_json", +] [[package]] -name = "adler32" -version = "1.2.0" +name = "abi_stable_derive" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d7178468b407a4ee10e881bc7a328a65e739f0863615cca4429d43916b05e898" +dependencies = [ + "abi_stable_shared", + "as_derive_utils", + "core_extensions", + "proc-macro2", + "quote", + "rustc_version", + "syn 1.0.109", + "typed-arena", +] + +[[package]] +name = "abi_stable_shared" +version = "0.11.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2b5df7688c123e63f4d4d649cba63f2967ba7f7861b1664fca3f77d3dad2b63" +dependencies = [ + "core_extensions", +] + +[[package]] +name = "adler2" +version = "2.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aae1277d39aeec15cb388266ecc24b11c80469deae6067e17a1a7aa9e5c1f234" +checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa" [[package]] name = "ahash" -version = "0.8.3" +version = "0.8.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c99f64d1e06488f620f932677e24bc6e2897582980441ae90a671415bd7ec2f" +checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75" dependencies = [ "cfg-if", "const-random", - "getrandom", + "getrandom 0.3.4", "once_cell", "version_check", + "zerocopy", ] [[package]] name = "aho-corasick" -version = "0.7.20" +version = "1.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cc936419f96fa211c1b9166887b38e5e40b19958e5b895be7c1f93adec7071ac" +checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301" dependencies = [ "memchr", ] @@ -51,6 +94,12 @@ dependencies = [ "alloc-no-stdlib", ] +[[package]] +name = "allocator-api2" +version = "0.2.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923" + [[package]] name = "android_system_properties" version = "0.1.5" @@ -62,56 +111,75 @@ dependencies = [ [[package]] name = "anyhow" -version = "1.0.69" +version = "1.0.101" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "224afbd727c3d6e4b90103ece64b8d1b67fbb1973b1046c2281eed3f3803f800" +checksum = "5f0e0fee31ef5ed1ba1316088939cea399010ed7731dba877ed44aeb407a75ea" [[package]] name = "apache-avro" -version = "0.14.0" +version = "0.21.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"8cf4144857f9e4d7dd6cc4ba4c78efd2a46bad682b029bd0d91e76a021af1b2a" +checksum = "36fa98bc79671c7981272d91a8753a928ff6a1cd8e4f20a44c45bd5d313840bf" dependencies = [ - "byteorder", + "bigdecimal", + "bon", + "bzip2", "crc32fast", "digest", - "lazy_static", - "libflate", + "liblzma", "log", + "miniz_oxide", "num-bigint", "quad-rand", "rand", - "regex", + "regex-lite", "serde", + "serde_bytes", "serde_json", "snap", "strum", "strum_macros", "thiserror", - "typed-builder", "uuid", - "zerocopy", + "zstd", +] + +[[package]] +name = "ar_archive_writer" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7eb93bbb63b9c227414f6eb3a0adfddca591a8ce1e9b60661bb08969b87e340b" +dependencies = [ + "object", +] + +[[package]] +name = "arc-swap" +version = "1.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f9f3647c145568cec02c42054e07bdf9a5a698e15b466fb2341bfc393cd24aa5" +dependencies = [ + "rustversion", ] [[package]] name = "arrayref" -version = "0.3.6" +version = "0.3.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a4c527152e37cf757a3f78aae5a06fbeefdb07ccc535c980a3208ee3060dd544" +checksum = "76a2e8124351fda1ef8aaaa3bbd7ebbcb486bbcd4225aca0aa0d84bb2db8fecb" [[package]] name = "arrayvec" -version = "0.7.2" +version = "0.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8da52d66c7071e2e3fa2a1e5c6d088fec47b593032b254f5e980de8ea54454d6" +checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" [[package]] name = "arrow" -version = "34.0.0" +version = "58.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f410d3907b6b3647b9e7bca4551274b2e3d716aa940afb67b7287257401da921" +checksum = "602268ce9f569f282cedb9a9f6bac569b680af47b9b077d515900c03c5d190da" dependencies = [ - "ahash", "arrow-arith", "arrow-array", "arrow-buffer", @@ -121,121 +189,129 @@ dependencies = [ "arrow-ipc", "arrow-json", 
"arrow-ord", + "arrow-pyarrow", "arrow-row", "arrow-schema", "arrow-select", "arrow-string", - "comfy-table", - "pyo3", ] [[package]] name = "arrow-arith" -version = "34.0.0" +version = "58.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f87391cf46473c9bc53dab68cb8872c3a81d4dfd1703f1c8aa397dba9880a043" +checksum = "cd53c6bf277dea91f136ae8e3a5d7041b44b5e489e244e637d00ae302051f56f" dependencies = [ "arrow-array", "arrow-buffer", "arrow-data", "arrow-schema", "chrono", - "half", - "num", + "num-traits", ] [[package]] name = "arrow-array" -version = "34.0.0" +version = "58.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d35d5475e65c57cffba06d0022e3006b677515f99b54af33a7cd54f6cdd4a5b5" +checksum = "e53796e07a6525edaf7dc28b540d477a934aff14af97967ad1d5550878969b9e" dependencies = [ "ahash", "arrow-buffer", "arrow-data", "arrow-schema", "chrono", + "chrono-tz", "half", - "hashbrown 0.13.2", - "num", + "hashbrown 0.16.1", + "num-complex", + "num-integer", + "num-traits", ] [[package]] name = "arrow-buffer" -version = "34.0.0" +version = "58.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "68b4ec72eda7c0207727df96cf200f539749d736b21f3e782ece113e18c1a0a7" +checksum = "f2c1a85bb2e94ee10b76531d8bc3ce9b7b4c0d508cabfb17d477f63f2617bd20" dependencies = [ + "bytes", "half", - "num", + "num-bigint", + "num-traits", ] [[package]] name = "arrow-cast" -version = "34.0.0" +version = "58.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0a7285272c9897321dfdba59de29f5b05aeafd3cdedf104a941256d155f6d304" +checksum = "89fb245db6b0e234ed8e15b644edb8664673fefe630575e94e62cd9d489a8a26" dependencies = [ "arrow-array", "arrow-buffer", "arrow-data", + "arrow-ord", "arrow-schema", "arrow-select", + "atoi", + "base64", "chrono", + "comfy-table", + "half", "lexical-core", - "num", + "num-traits", + "ryu", ] [[package]] name = "arrow-csv" -version = "34.0.0" +version 
= "58.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "981ee4e7f6a120da04e00d0b39182e1eeacccb59c8da74511de753c56b7fddf7" +checksum = "d374882fb465a194462527c0c15a93aa19a554cf690a6b77a26b2a02539937a7" dependencies = [ "arrow-array", - "arrow-buffer", "arrow-cast", - "arrow-data", "arrow-schema", "chrono", "csv", "csv-core", - "lazy_static", - "lexical-core", "regex", ] [[package]] name = "arrow-data" -version = "34.0.0" +version = "58.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "27cc673ee6989ea6e4b4e8c7d461f7e06026a096c8f0b1a7288885ff71ae1e56" +checksum = "189d210bc4244c715fa3ed9e6e22864673cccb73d5da28c2723fb2e527329b33" dependencies = [ "arrow-buffer", "arrow-schema", "half", - "num", + "num-integer", + "num-traits", ] [[package]] name = "arrow-ipc" -version = "34.0.0" +version = "58.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e37b8b69d9e59116b6b538e8514e0ec63a30f08b617ce800d31cb44e3ef64c1a" +checksum = "7968c2e5210c41f4909b2ef76f6e05e172b99021c2def5edf3cc48fdd39d1d6c" dependencies = [ "arrow-array", "arrow-buffer", - "arrow-cast", "arrow-data", "arrow-schema", + "arrow-select", "flatbuffers", + "lz4_flex", + "zstd", ] [[package]] name = "arrow-json" -version = "34.0.0" +version = "58.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "80c3fa0bed7cfebf6d18e46b733f9cb8a1cb43ce8e6539055ca3e1e48a426266" +checksum = "92111dba5bf900f443488e01f00d8c4ddc2f47f5c50039d18120287b580baa22" dependencies = [ "arrow-array", "arrow-buffer", @@ -245,134 +321,197 @@ dependencies = [ "chrono", "half", "indexmap", + "itoa", "lexical-core", - "num", + "memchr", + "num-traits", + "ryu", + "serde_core", "serde_json", + "simdutf8", ] [[package]] name = "arrow-ord" -version = "34.0.0" +version = "58.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d247dce7bed6a8d6a3c6debfa707a3a2f694383f0c692a39d736a593eae5ef94" 
+checksum = "211136cb253577ee1a6665f741a13136d4e563f64f5093ffd6fb837af90b9495" dependencies = [ "arrow-array", "arrow-buffer", "arrow-data", "arrow-schema", "arrow-select", - "num", +] + +[[package]] +name = "arrow-pyarrow" +version = "58.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "205437da4c0877c756c81bfe847a621d0a740cd00a155109d65510a1a62ebcd9" +dependencies = [ + "arrow-array", + "arrow-data", + "arrow-schema", + "pyo3", ] [[package]] name = "arrow-row" -version = "34.0.0" +version = "58.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8d609c0181f963cea5c70fddf9a388595b5be441f3aa1d1cdbf728ca834bbd3a" +checksum = "8e0f20145f9f5ea3fe383e2ba7a7487bf19be36aa9dbf5dd6a1f92f657179663" dependencies = [ - "ahash", "arrow-array", "arrow-buffer", "arrow-data", "arrow-schema", "half", - "hashbrown 0.13.2", ] [[package]] name = "arrow-schema" -version = "34.0.0" +version = "58.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "64951898473bfb8e22293e83a44f02874d2257514d49cd95f9aa4afcff183fbc" +checksum = "1b47e0ca91cc438d2c7879fe95e0bca5329fff28649e30a88c6f760b1faeddcb" dependencies = [ "bitflags", + "serde_core", + "serde_json", ] [[package]] name = "arrow-select" -version = "34.0.0" +version = "58.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2a513d89c2e1ac22b28380900036cf1f3992c6443efc5e079de631dcf83c6888" +checksum = "750a7d1dda177735f5e82a314485b6915c7cccdbb278262ac44090f4aba4a325" dependencies = [ + "ahash", "arrow-array", "arrow-buffer", "arrow-data", "arrow-schema", - "num", + "num-traits", ] [[package]] name = "arrow-string" -version = "34.0.0" +version = "58.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5288979b2705dae1114c864d73150629add9153b9b8f1d7ee3963db94c372ba5" +checksum = "e1eab1208bc4fe55d768cdc9b9f3d9df5a794cdb3ee2586bf89f9b30dc31ad8c" dependencies = [ "arrow-array", 
"arrow-buffer", "arrow-data", "arrow-schema", "arrow-select", + "memchr", + "num-traits", "regex", "regex-syntax", ] +[[package]] +name = "as_derive_utils" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff3c96645900a44cf11941c111bd08a6573b0e2f9f69bc9264b179d8fae753c4" +dependencies = [ + "core_extensions", + "proc-macro2", + "quote", + "syn 1.0.109", +] + [[package]] name = "async-compression" -version = "0.3.15" +version = "0.4.40" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "942c7cd7ae39e91bde4820d74132e9862e62c2f386c3aa90ccf55949f5bad63a" +checksum = "7d67d43201f4d20c78bcda740c142ca52482d81da80681533d33bf3f0596c8e2" dependencies = [ - "bzip2", - "flate2", - "futures-core", - "futures-io", - "memchr", + "compression-codecs", + "compression-core", "pin-project-lite", "tokio", - "xz2", - "zstd 0.11.2+zstd.1.5.2", - "zstd-safe 5.0.2+zstd.1.5.2", +] + +[[package]] +name = "async-ffi" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f4de21c0feef7e5a556e51af767c953f0501f7f300ba785cc99c47bdc8081a50" +dependencies = [ + "abi_stable", ] [[package]] name = "async-recursion" -version = "1.0.2" +version = "1.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3b015a331cc64ebd1774ba119538573603427eaace0a1950c423ab971f903796" +checksum = "3b43422f69d8ff38f95f1b2bb76517c91589a924d1559a0e935d7c8ce0274c11" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.117", ] [[package]] name = "async-trait" -version = "0.1.66" +version = "0.1.89" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b84f9ebcc6c1f5b8cb160f6990096a5c127f423fcb6e1ccc46c370cbdfb75dfc" +checksum = "9035ad2d096bed7955a320ee7e2230574d28fd3c3a0f186cbea1ff3c7eed5dbb" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.117", +] + +[[package]] +name = "atoi" +version = "2.0.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "f28d99ec8bfea296261ca1af174f24225171fea9664ba9003cbebee704810528" +dependencies = [ + "num-traits", ] +[[package]] +name = "atomic-waker" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1505bd5d3d116872e7271a6d4e16d81d0c8570876c8de68093a09ac269d8aac0" + [[package]] name = "autocfg" -version = "1.1.0" +version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" +checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" [[package]] name = "base64" -version = "0.21.0" +version = "0.22.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" + +[[package]] +name = "bigdecimal" +version = "0.4.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a4a4ddaa51a5bc52a6948f74c06d20aaaddb71924eab79b8c97a8c556e942d6a" +checksum = "4d6867f1565b3aad85681f1015055b087fcfd840d6aeee6eee7f2da317603695" +dependencies = [ + "autocfg", + "libm", + "num-bigint", + "num-integer", + "num-traits", + "serde", +] [[package]] name = "bitflags" -version = "1.3.2" +version = "2.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" +checksum = "843867be96c8daad0d758b57df9392b6d8d271134fce549de6ce169ff98a92af" [[package]] name = "blake2" @@ -385,16 +524,16 @@ dependencies = [ [[package]] name = "blake3" -version = "1.3.3" +version = "1.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "42ae2468a89544a466886840aa467a25b766499f4f04bf7d9fcd10ecee9fccef" +checksum = "2468ef7d57b3fb7e16b576e8377cdbde2320c60e1491e961d11da40fc4f02a2d" dependencies = [ "arrayref", "arrayvec", "cc", "cfg-if", "constant_time_eq", - "digest", + 
"cpufeatures", ] [[package]] @@ -406,11 +545,36 @@ dependencies = [ "generic-array", ] +[[package]] +name = "bon" +version = "3.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d13a61f2963b88eef9c1be03df65d42f6996dfeac1054870d950fcf66686f83" +dependencies = [ + "bon-macros", + "rustversion", +] + +[[package]] +name = "bon-macros" +version = "3.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d314cc62af2b6b0c65780555abb4d02a03dd3b799cd42419044f0c38d99738c0" +dependencies = [ + "darling", + "ident_case", + "prettyplease", + "proc-macro2", + "quote", + "rustversion", + "syn 2.0.117", +] + [[package]] name = "brotli" -version = "3.3.4" +version = "8.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a1a0b1dbcc8ae29329621f8d4f0d835787c1c38bb1401979b49d13b0b305ff68" +checksum = "4bd8b9603c7aa97359dbd97ecf258968c95f3adddd6db2f7e7a5bef101c84560" dependencies = [ "alloc-no-stdlib", "alloc-stdlib", @@ -419,9 +583,9 @@ dependencies = [ [[package]] name = "brotli-decompressor" -version = "2.3.4" +version = "5.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4b6561fd3f895a11e8f72af2cb7d22e08366bebc2b6b57f7744c4bda27034744" +checksum = "874bb8112abecc98cbd6d81ea4fa7e94fb9449648c93cc89aa40c81c24d7de03" dependencies = [ "alloc-no-stdlib", "alloc-stdlib", @@ -429,236 +593,306 @@ dependencies = [ [[package]] name = "bumpalo" -version = "3.12.0" +version = "3.20.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0d261e256854913907f67ed06efbc3338dfe6179796deefc1ff763fc1aee5535" +checksum = "5c6f81257d10a0f602a294ae4182251151ff97dbb504ef9afcdda4a64b24d9b4" [[package]] name = "byteorder" -version = "1.4.3" +version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610" +checksum = 
"1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" [[package]] name = "bytes" -version = "1.4.0" +version = "1.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89b2fd2a0dcf38d7971e2194b6b6eebab45ae01067456a7fd93d5547a61b70be" +checksum = "1e748733b7cbc798e1434b6ac524f0c1ff2ab456fe201501e6497c8417a4fc33" [[package]] name = "bzip2" -version = "0.4.4" +version = "0.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bdb116a6ef3f6c3698828873ad02c3014b3c85cadb88496095628e3ef1e347f8" +checksum = "f3a53fac24f34a81bc9954b5d6cfce0c21e18ec6959f44f56e8e90e4bb7c346c" dependencies = [ - "bzip2-sys", - "libc", + "libbz2-rs-sys", ] [[package]] -name = "bzip2-sys" -version = "0.1.11+1.0.8" +name = "cc" +version = "1.2.56" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "736a955f3fa7875102d57c82b8cac37ec45224a07fd32d58f9f7a186b6cd4cdc" +checksum = "aebf35691d1bfb0ac386a69bac2fde4dd276fb618cf8bf4f5318fe285e821bb2" dependencies = [ - "cc", + "find-msvc-tools", + "jobserver", "libc", - "pkg-config", + "shlex", ] [[package]] -name = "cc" -version = "1.0.79" +name = "cfg-if" +version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "50d30906286121d95be3d479533b458f87493b30a4b5f79a607db8f5d11aa91f" -dependencies = [ - "jobserver", -] +checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" [[package]] -name = "cfg-if" -version = "1.0.0" +name = "cfg_aliases" +version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" +checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724" [[package]] name = "chrono" -version = "0.4.24" +version = "0.4.44" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4e3c5919066adf22df73762e50cffcde3a758f2a848b113b586d1f86728b673b" +checksum = 
"c673075a2e0e5f4a1dde27ce9dee1ea4558c7ffe648f576438a20ca1d2acc4b0" dependencies = [ "iana-time-zone", - "js-sys", - "num-integer", "num-traits", "serde", - "time", - "wasm-bindgen", - "winapi", + "windows-link", ] [[package]] -name = "codespan-reporting" -version = "0.11.1" +name = "chrono-tz" +version = "0.10.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3538270d33cc669650c4b093848450d380def10c331d38c768e34cac80576e6e" +checksum = "a6139a8597ed92cf816dfb33f5dd6cf0bb93a6adc938f11039f371bc5bcd26c3" dependencies = [ - "termcolor", - "unicode-width", + "chrono", + "phf", +] + +[[package]] +name = "cmake" +version = "0.1.57" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "75443c44cd6b379beb8c5b45d85d0773baf31cce901fe7bb252f4eff3008ef7d" +dependencies = [ + "cc", ] [[package]] name = "comfy-table" -version = "6.1.4" +version = "7.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6e7b787b0dc42e8111badfdbe4c3059158ccb2db8780352fa1b01e8ccf45cc4d" +checksum = "958c5d6ecf1f214b4c2bbbbf6ab9523a864bd136dcf71a7e8904799acfe1ad47" dependencies = [ - "strum", - "strum_macros", + "unicode-segmentation", "unicode-width", ] +[[package]] +name = "compression-codecs" +version = "0.4.37" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eb7b51a7d9c967fc26773061ba86150f19c50c0d65c887cb1fbe295fd16619b7" +dependencies = [ + "bzip2", + "compression-core", + "flate2", + "liblzma", + "memchr", + "zstd", + "zstd-safe", +] + +[[package]] +name = "compression-core" +version = "0.4.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "75984efb6ed102a0d42db99afb6c1948f0380d1d91808d5529916e6c08b49d8d" + [[package]] name = "const-random" -version = "0.1.15" +version = "0.1.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "368a7a772ead6ce7e1de82bfb04c485f3db8ec744f72925af5735e29a22cc18e" +checksum = 
"87e00182fe74b066627d63b85fd550ac2998d4b0bd86bfed477a0ae4c7c71359" dependencies = [ "const-random-macro", - "proc-macro-hack", ] [[package]] name = "const-random-macro" -version = "0.1.15" +version = "0.1.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9d7d6ab3c3a2282db210df5f02c4dab6e0a7057af0fb7ebd4070f30fe05c0ddb" +checksum = "f9d839f2a20b0aee515dc581a6172f2321f96cab76c1a38a4c584a194955390e" dependencies = [ - "getrandom", + "getrandom 0.2.17", "once_cell", - "proc-macro-hack", "tiny-keccak", ] +[[package]] +name = "const_panic" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e262cdaac42494e3ae34c43969f9cdeb7da178bdb4b66fa6a1ea2edb4c8ae652" +dependencies = [ + "typewit", +] + [[package]] name = "constant_time_eq" -version = "0.2.5" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d52eff69cd5e647efe296129160853a42795992097e8af39800e1060caeea9b" + +[[package]] +name = "core-foundation" +version = "0.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "13418e745008f7349ec7e449155f419a61b92b58a99cc3616942b926825ec76b" +checksum = "b2a6cd9ae233e7f62ba4e9353e81a88df7fc8a5987b8d445b4d90c879bd156f6" +dependencies = [ + "core-foundation-sys", + "libc", +] [[package]] name = "core-foundation-sys" -version = "0.8.3" +version = "0.8.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" + +[[package]] +name = "core_extensions" +version = "1.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42bb5e5d0269fd4f739ea6cedaf29c16d81c27a7ce7582008e90eb50dcd57003" +dependencies = [ + "core_extensions_proc_macros", +] + +[[package]] +name = "core_extensions_proc_macros" +version = "1.5.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"5827cebf4670468b8772dd191856768aedcb1b0278a04f989f7766351917b9dc" +checksum = "533d38ecd2709b7608fb8e18e4504deb99e9a72879e6aa66373a76d8dc4259ea" [[package]] name = "cpufeatures" -version = "0.2.5" +version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "28d997bd5e24a5928dd43e46dc529867e207907fe0b239c3477d924f7f2ca320" +checksum = "59ed5838eebb26a2bb2e58f6d5b5316989ae9d08bab10e0e6d103e656d1b0280" dependencies = [ "libc", ] [[package]] name = "crc32fast" -version = "1.3.2" +version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b540bd8bc810d3885c6ea91e2018302f68baba2129ab3e88f32389ee9370880d" +checksum = "9481c1c90cbf2ac953f07c8d4a58aa3945c425b7185c9154d67a65e4230da511" dependencies = [ "cfg-if", ] +[[package]] +name = "crossbeam-channel" +version = "0.5.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "82b8f8f868b36967f9606790d1903570de9ceaf870a7bf9fbbd3016d636a2cb2" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" + [[package]] name = "crunchy" -version = "0.2.2" +version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7" +checksum = "460fbee9c2c2f33933d720630a6a0bac33ba7053db5344fac858d4b8952d77d5" [[package]] name = "crypto-common" -version = "0.1.6" +version = "0.1.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1bfb12502f3fc46cca1bb51ac28df9d618d813cdc3d2f25b9fe775a34af26bb3" +checksum = "78c8292055d1c1df0cce5d180393dc8cce0abec0a7102adb6c7b1eef6016d60a" dependencies = [ "generic-array", "typenum", ] +[[package]] +name = "cstr" +version = "0.2.12" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "68523903c8ae5aacfa32a0d9ae60cadeb764e1da14ee0d26b1f3089f13a54636" +dependencies = [ + "proc-macro2", + "quote", +] + [[package]] name = "csv" -version = "1.2.1" +version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b015497079b9a9d69c02ad25de6c0a6edef051ea6360a327d0bd05802ef64ad" +checksum = "52cd9d68cf7efc6ddfaaee42e7288d3a99d613d4b50f76ce9827ae0c6e14f938" dependencies = [ "csv-core", "itoa", "ryu", - "serde", + "serde_core", ] [[package]] name = "csv-core" -version = "0.1.10" +version = "0.1.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2b2466559f260f48ad25fe6317b3c8dac77b5bdb5763ac7d9d6103530663bc90" +checksum = "704a3c26996a80471189265814dbc2c257598b96b8a7feae2d31ace646bb9782" dependencies = [ "memchr", ] [[package]] -name = "cxx" -version = "1.0.92" +name = "darling" +version = "0.23.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a140f260e6f3f79013b8bfc65e7ce630c9ab4388c6a89c71e07226f49487b72" +checksum = "25ae13da2f202d56bd7f91c25fba009e7717a1e4a1cc98a76d844b65ae912e9d" dependencies = [ - "cc", - "cxxbridge-flags", - "cxxbridge-macro", - "link-cplusplus", + "darling_core", + "darling_macro", ] [[package]] -name = "cxx-build" -version = "1.0.92" +name = "darling_core" +version = "0.23.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da6383f459341ea689374bf0a42979739dc421874f112ff26f829b8040b8e613" +checksum = "9865a50f7c335f53564bb694ef660825eb8610e0a53d3e11bf1b0d3df31e03b0" dependencies = [ - "cc", - "codespan-reporting", - "once_cell", + "ident_case", "proc-macro2", "quote", - "scratch", - "syn", + "strsim", + "syn 2.0.117", ] [[package]] -name = "cxxbridge-flags" -version = "1.0.92" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "90201c1a650e95ccff1c8c0bb5a343213bdd317c6e600a93075bca2eff54ec97" - -[[package]] -name = "cxxbridge-macro" -version = "1.0.92" +name = 
"darling_macro" +version = "0.23.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b75aed41bb2e6367cae39e6326ef817a851db13c13e4f3263714ca3cfb8de56" +checksum = "ac3984ec7bd6cfa798e62b4a642426a5be0e68f9401cfc2a01e3fa9ea2fcdb8d" dependencies = [ - "proc-macro2", + "darling_core", "quote", - "syn", + "syn 2.0.117", ] [[package]] name = "dashmap" -version = "5.4.0" +version = "6.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "907076dfda823b0b36d2a1bb5f90c96660a5bbcd7729e10727f07858f22c4edc" +checksum = "5041cc499144891f3790297212f32a74fb938e5136a14943f338ef9e0ae276cf" dependencies = [ "cfg-if", - "hashbrown 0.12.3", + "crossbeam-utils", + "hashbrown 0.14.5", "lock_api", "once_cell", "parking_lot_core", @@ -666,81 +900,309 @@ dependencies = [ [[package]] name = "datafusion" -version = "20.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c467c5802cb75ecb0acffa2121d8361a8903fef05b21fd1ca12a55797df8a75" +version = "53.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=35749607f585b3bf25b66b7d2289c56c18d03e4f#35749607f585b3bf25b66b7d2289c56c18d03e4f" dependencies = [ - "ahash", - "apache-avro", "arrow", - "async-compression", + "arrow-schema", "async-trait", "bytes", "bzip2", "chrono", - "dashmap", + "datafusion-catalog", + "datafusion-catalog-listing", "datafusion-common", + "datafusion-common-runtime", + "datafusion-datasource", + "datafusion-datasource-arrow", + "datafusion-datasource-avro", + "datafusion-datasource-csv", + "datafusion-datasource-json", + "datafusion-datasource-parquet", "datafusion-execution", "datafusion-expr", + "datafusion-expr-common", + "datafusion-functions", + "datafusion-functions-aggregate", + "datafusion-functions-nested", + "datafusion-functions-table", + "datafusion-functions-window", "datafusion-optimizer", "datafusion-physical-expr", - "datafusion-row", + "datafusion-physical-expr-adapter", + 
"datafusion-physical-expr-common", + "datafusion-physical-optimizer", + "datafusion-physical-plan", + "datafusion-session", "datafusion-sql", "flate2", "futures", - "glob", - "hashbrown 0.13.2", - "indexmap", "itertools", - "lazy_static", + "liblzma", "log", - "num-traits", - "num_cpus", "object_store", "parking_lot", "parquet", - "paste", - "percent-encoding", - "pin-project-lite", "rand", - "smallvec", + "regex", "sqlparser", "tempfile", "tokio", - "tokio-stream", - "tokio-util", "url", "uuid", - "xz2", - "zstd 0.12.3+zstd.1.5.2", + "zstd", +] + +[[package]] +name = "datafusion-catalog" +version = "53.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=35749607f585b3bf25b66b7d2289c56c18d03e4f#35749607f585b3bf25b66b7d2289c56c18d03e4f" +dependencies = [ + "arrow", + "async-trait", + "dashmap", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-datasource", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-expr", + "datafusion-physical-plan", + "datafusion-session", + "futures", + "itertools", + "log", + "object_store", + "parking_lot", + "tokio", +] + +[[package]] +name = "datafusion-catalog-listing" +version = "53.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=35749607f585b3bf25b66b7d2289c56c18d03e4f#35749607f585b3bf25b66b7d2289c56c18d03e4f" +dependencies = [ + "arrow", + "async-trait", + "datafusion-catalog", + "datafusion-common", + "datafusion-datasource", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-expr", + "datafusion-physical-expr-adapter", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "futures", + "itertools", + "log", + "object_store", ] [[package]] name = "datafusion-common" -version = "20.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c5c60e0a92bae06c6a55c4618947ae0837f833929b8d52ade1d50cf8b5f3b381" +version = "53.0.0" +source = 
"git+https://github.com/apache/datafusion.git?rev=35749607f585b3bf25b66b7d2289c56c18d03e4f#35749607f585b3bf25b66b7d2289c56c18d03e4f" dependencies = [ + "ahash", "apache-avro", "arrow", + "arrow-ipc", "chrono", - "num_cpus", + "half", + "hashbrown 0.16.1", + "indexmap", + "itertools", + "libc", + "log", "object_store", "parquet", - "pyo3", + "paste", + "recursive", "sqlparser", + "tokio", + "web-time", +] + +[[package]] +name = "datafusion-common-runtime" +version = "53.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=35749607f585b3bf25b66b7d2289c56c18d03e4f#35749607f585b3bf25b66b7d2289c56c18d03e4f" +dependencies = [ + "futures", + "log", + "tokio", +] + +[[package]] +name = "datafusion-datasource" +version = "53.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=35749607f585b3bf25b66b7d2289c56c18d03e4f#35749607f585b3bf25b66b7d2289c56c18d03e4f" +dependencies = [ + "arrow", + "async-compression", + "async-trait", + "bytes", + "bzip2", + "chrono", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-expr", + "datafusion-physical-expr-adapter", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "datafusion-session", + "flate2", + "futures", + "glob", + "itertools", + "liblzma", + "log", + "object_store", + "rand", + "tokio", + "tokio-util", + "url", + "zstd", +] + +[[package]] +name = "datafusion-datasource-arrow" +version = "53.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=35749607f585b3bf25b66b7d2289c56c18d03e4f#35749607f585b3bf25b66b7d2289c56c18d03e4f" +dependencies = [ + "arrow", + "arrow-ipc", + "async-trait", + "bytes", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-datasource", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "datafusion-session", + "futures", + "itertools", + "object_store", + "tokio", ] +[[package]] +name = 
"datafusion-datasource-avro" +version = "53.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=35749607f585b3bf25b66b7d2289c56c18d03e4f#35749607f585b3bf25b66b7d2289c56c18d03e4f" +dependencies = [ + "apache-avro", + "arrow", + "async-trait", + "bytes", + "datafusion-common", + "datafusion-datasource", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "datafusion-session", + "futures", + "num-traits", + "object_store", +] + +[[package]] +name = "datafusion-datasource-csv" +version = "53.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=35749607f585b3bf25b66b7d2289c56c18d03e4f#35749607f585b3bf25b66b7d2289c56c18d03e4f" +dependencies = [ + "arrow", + "async-trait", + "bytes", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-datasource", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "datafusion-session", + "futures", + "object_store", + "regex", + "tokio", +] + +[[package]] +name = "datafusion-datasource-json" +version = "53.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=35749607f585b3bf25b66b7d2289c56c18d03e4f#35749607f585b3bf25b66b7d2289c56c18d03e4f" +dependencies = [ + "arrow", + "async-trait", + "bytes", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-datasource", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "datafusion-session", + "futures", + "object_store", + "serde_json", + "tokio", + "tokio-stream", +] + +[[package]] +name = "datafusion-datasource-parquet" +version = "53.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=35749607f585b3bf25b66b7d2289c56c18d03e4f#35749607f585b3bf25b66b7d2289c56c18d03e4f" +dependencies = [ + "arrow", + "async-trait", + "bytes", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-datasource", + "datafusion-execution", + "datafusion-expr", + 
"datafusion-functions-aggregate-common", + "datafusion-physical-expr", + "datafusion-physical-expr-adapter", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "datafusion-pruning", + "datafusion-session", + "futures", + "itertools", + "log", + "object_store", + "parking_lot", + "parquet", + "tokio", +] + +[[package]] +name = "datafusion-doc" +version = "53.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=35749607f585b3bf25b66b7d2289c56c18d03e4f#35749607f585b3bf25b66b7d2289c56c18d03e4f" + [[package]] name = "datafusion-execution" -version = "20.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "96eab4ef369c3972a4f99418ba8673aa0cf818aa4b892d44c35b473edce8e021" +version = "53.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=35749607f585b3bf25b66b7d2289c56c18d03e4f#35749607f585b3bf25b66b7d2289c56c18d03e4f" dependencies = [ + "arrow", + "arrow-buffer", + "async-trait", + "chrono", "dashmap", "datafusion-common", "datafusion-expr", - "hashbrown 0.13.2", + "datafusion-physical-expr-common", + "futures", "log", "object_store", "parking_lot", @@ -751,60 +1213,91 @@ dependencies = [ [[package]] name = "datafusion-expr" -version = "20.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d0533db37a619a045a4fd37dabc2d99a85f10127d7f100d75914c194e9e36ed7" +version = "53.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=35749607f585b3bf25b66b7d2289c56c18d03e4f#35749607f585b3bf25b66b7d2289c56c18d03e4f" dependencies = [ - "ahash", "arrow", + "async-trait", + "chrono", "datafusion-common", - "log", + "datafusion-doc", + "datafusion-expr-common", + "datafusion-functions-aggregate-common", + "datafusion-functions-window-common", + "datafusion-physical-expr-common", + "indexmap", + "itertools", + "paste", + "recursive", + "serde_json", "sqlparser", ] [[package]] -name = "datafusion-optimizer" -version = "20.0.0" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "3dbb8f467d31c3efebd372854e35cf1ca1572f163d0f031e2c6f8d78b317a43b" +name = "datafusion-expr-common" +version = "53.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=35749607f585b3bf25b66b7d2289c56c18d03e4f#35749607f585b3bf25b66b7d2289c56c18d03e4f" +dependencies = [ + "arrow", + "datafusion-common", + "indexmap", + "itertools", + "paste", +] + +[[package]] +name = "datafusion-ffi" +version = "53.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=35749607f585b3bf25b66b7d2289c56c18d03e4f#35749607f585b3bf25b66b7d2289c56c18d03e4f" dependencies = [ + "abi_stable", "arrow", + "arrow-schema", + "async-ffi", "async-trait", - "chrono", + "datafusion-catalog", "datafusion-common", + "datafusion-datasource", + "datafusion-execution", "datafusion-expr", + "datafusion-functions-aggregate-common", "datafusion-physical-expr", - "hashbrown 0.13.2", - "itertools", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "datafusion-proto", + "datafusion-proto-common", + "datafusion-session", + "futures", "log", - "regex-syntax", + "prost", + "semver", + "tokio", ] [[package]] -name = "datafusion-physical-expr" -version = "20.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ceb003f6c0cdc25d2e9e92f8ee0322faca2a8f94f03d070b5ca15224c5503a3c" +name = "datafusion-functions" +version = "53.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=35749607f585b3bf25b66b7d2289c56c18d03e4f#35749607f585b3bf25b66b7d2289c56c18d03e4f" dependencies = [ - "ahash", "arrow", "arrow-buffer", - "arrow-schema", + "base64", "blake2", "blake3", "chrono", + "chrono-tz", "datafusion-common", + "datafusion-doc", + "datafusion-execution", "datafusion-expr", - "datafusion-row", - "half", - "hashbrown 0.13.2", - "indexmap", + "datafusion-expr-common", + "datafusion-macros", + "hex", "itertools", - "lazy_static", + "log", "md-5", + "memchr", "num-traits", - 
"paste", - "petgraph", "rand", "regex", "sha2", @@ -813,75 +1306,371 @@ dependencies = [ ] [[package]] -name = "datafusion-python" -version = "20.0.0" +name = "datafusion-functions-aggregate" +version = "53.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=35749607f585b3bf25b66b7d2289c56c18d03e4f#35749607f585b3bf25b66b7d2289c56c18d03e4f" dependencies = [ + "ahash", + "arrow", + "datafusion-common", + "datafusion-doc", + "datafusion-execution", + "datafusion-expr", + "datafusion-functions-aggregate-common", + "datafusion-macros", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "half", + "log", + "num-traits", + "paste", +] + +[[package]] +name = "datafusion-functions-aggregate-common" +version = "53.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=35749607f585b3bf25b66b7d2289c56c18d03e4f#35749607f585b3bf25b66b7d2289c56c18d03e4f" +dependencies = [ + "ahash", + "arrow", + "datafusion-common", + "datafusion-expr-common", + "datafusion-physical-expr-common", +] + +[[package]] +name = "datafusion-functions-nested" +version = "53.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=35749607f585b3bf25b66b7d2289c56c18d03e4f#35749607f585b3bf25b66b7d2289c56c18d03e4f" +dependencies = [ + "arrow", + "arrow-ord", + "datafusion-common", + "datafusion-doc", + "datafusion-execution", + "datafusion-expr", + "datafusion-expr-common", + "datafusion-functions", + "datafusion-functions-aggregate", + "datafusion-functions-aggregate-common", + "datafusion-macros", + "datafusion-physical-expr-common", + "hashbrown 0.16.1", + "itertools", + "itoa", + "log", + "paste", +] + +[[package]] +name = "datafusion-functions-table" +version = "53.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=35749607f585b3bf25b66b7d2289c56c18d03e4f#35749607f585b3bf25b66b7d2289c56c18d03e4f" +dependencies = [ + "arrow", "async-trait", - "datafusion", + "datafusion-catalog", "datafusion-common", "datafusion-expr", - 
"datafusion-optimizer", - "datafusion-sql", + "datafusion-physical-plan", + "parking_lot", + "paste", +] + +[[package]] +name = "datafusion-functions-window" +version = "53.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=35749607f585b3bf25b66b7d2289c56c18d03e4f#35749607f585b3bf25b66b7d2289c56c18d03e4f" +dependencies = [ + "arrow", + "datafusion-common", + "datafusion-doc", + "datafusion-expr", + "datafusion-functions-window-common", + "datafusion-macros", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "log", + "paste", +] + +[[package]] +name = "datafusion-functions-window-common" +version = "53.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=35749607f585b3bf25b66b7d2289c56c18d03e4f#35749607f585b3bf25b66b7d2289c56c18d03e4f" +dependencies = [ + "datafusion-common", + "datafusion-physical-expr-common", +] + +[[package]] +name = "datafusion-macros" +version = "53.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=35749607f585b3bf25b66b7d2289c56c18d03e4f#35749607f585b3bf25b66b7d2289c56c18d03e4f" +dependencies = [ + "datafusion-doc", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "datafusion-optimizer" +version = "53.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=35749607f585b3bf25b66b7d2289c56c18d03e4f#35749607f585b3bf25b66b7d2289c56c18d03e4f" +dependencies = [ + "arrow", + "chrono", + "datafusion-common", + "datafusion-expr", + "datafusion-expr-common", + "datafusion-physical-expr", + "indexmap", + "itertools", + "log", + "recursive", + "regex", + "regex-syntax", +] + +[[package]] +name = "datafusion-physical-expr" +version = "53.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=35749607f585b3bf25b66b7d2289c56c18d03e4f#35749607f585b3bf25b66b7d2289c56c18d03e4f" +dependencies = [ + "ahash", + "arrow", + "datafusion-common", + "datafusion-expr", + "datafusion-expr-common", + "datafusion-functions-aggregate-common", + "datafusion-physical-expr-common", + "half", + 
"hashbrown 0.16.1", + "indexmap", + "itertools", + "parking_lot", + "paste", + "petgraph", + "recursive", + "tokio", +] + +[[package]] +name = "datafusion-physical-expr-adapter" +version = "53.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=35749607f585b3bf25b66b7d2289c56c18d03e4f#35749607f585b3bf25b66b7d2289c56c18d03e4f" +dependencies = [ + "arrow", + "datafusion-common", + "datafusion-expr", + "datafusion-functions", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "itertools", +] + +[[package]] +name = "datafusion-physical-expr-common" +version = "53.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=35749607f585b3bf25b66b7d2289c56c18d03e4f#35749607f585b3bf25b66b7d2289c56c18d03e4f" +dependencies = [ + "ahash", + "arrow", + "chrono", + "datafusion-common", + "datafusion-expr-common", + "hashbrown 0.16.1", + "indexmap", + "itertools", + "parking_lot", +] + +[[package]] +name = "datafusion-physical-optimizer" +version = "53.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=35749607f585b3bf25b66b7d2289c56c18d03e4f#35749607f585b3bf25b66b7d2289c56c18d03e4f" +dependencies = [ + "arrow", + "datafusion-common", + "datafusion-execution", + "datafusion-expr", + "datafusion-expr-common", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "datafusion-pruning", + "itertools", + "recursive", +] + +[[package]] +name = "datafusion-physical-plan" +version = "53.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=35749607f585b3bf25b66b7d2289c56c18d03e4f#35749607f585b3bf25b66b7d2289c56c18d03e4f" +dependencies = [ + "ahash", + "arrow", + "arrow-ord", + "arrow-schema", + "async-trait", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-execution", + "datafusion-expr", + "datafusion-functions", + "datafusion-functions-aggregate-common", + "datafusion-functions-window-common", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + 
"futures", + "half", + "hashbrown 0.16.1", + "indexmap", + "itertools", + "log", + "num-traits", + "parking_lot", + "pin-project-lite", + "tokio", +] + +[[package]] +name = "datafusion-proto" +version = "53.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=35749607f585b3bf25b66b7d2289c56c18d03e4f#35749607f585b3bf25b66b7d2289c56c18d03e4f" +dependencies = [ + "arrow", + "chrono", + "datafusion-catalog", + "datafusion-catalog-listing", + "datafusion-common", + "datafusion-datasource", + "datafusion-datasource-arrow", + "datafusion-datasource-csv", + "datafusion-datasource-json", + "datafusion-datasource-parquet", + "datafusion-execution", + "datafusion-expr", + "datafusion-functions-table", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "datafusion-proto-common", + "object_store", + "prost", + "rand", +] + +[[package]] +name = "datafusion-proto-common" +version = "53.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=35749607f585b3bf25b66b7d2289c56c18d03e4f#35749607f585b3bf25b66b7d2289c56c18d03e4f" +dependencies = [ + "arrow", + "datafusion-common", + "prost", +] + +[[package]] +name = "datafusion-pruning" +version = "53.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=35749607f585b3bf25b66b7d2289c56c18d03e4f#35749607f585b3bf25b66b7d2289c56c18d03e4f" +dependencies = [ + "arrow", + "datafusion-common", + "datafusion-datasource", + "datafusion-expr-common", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "itertools", + "log", +] + +[[package]] +name = "datafusion-python" +version = "52.0.0" +dependencies = [ + "arrow", + "arrow-select", + "async-trait", + "cstr", + "datafusion", + "datafusion-ffi", + "datafusion-proto", "datafusion-substrait", "futures", + "log", "mimalloc", "object_store", "parking_lot", + "prost", + "prost-types", "pyo3", - "rand", - "regex-syntax", + "pyo3-async-runtimes", + "pyo3-build-config", + 
"pyo3-log", + "serde_json", "tokio", + "url", "uuid", ] [[package]] -name = "datafusion-row" -version = "20.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ba0b4478a205b767f6d98e26730a604562155a8b3cf64fe3600367e741b0f129" +name = "datafusion-session" +version = "53.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=35749607f585b3bf25b66b7d2289c56c18d03e4f#35749607f585b3bf25b66b7d2289c56c18d03e4f" dependencies = [ - "arrow", + "async-trait", "datafusion-common", - "paste", - "rand", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-plan", + "parking_lot", ] [[package]] name = "datafusion-sql" -version = "20.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a07af4200a9353ebe7bf0d6024894275cdebb303bb534b3a38eb0f532583083d" +version = "53.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=35749607f585b3bf25b66b7d2289c56c18d03e4f#35749607f585b3bf25b66b7d2289c56c18d03e4f" dependencies = [ - "arrow-schema", + "arrow", + "bigdecimal", + "chrono", "datafusion-common", "datafusion-expr", + "datafusion-functions-nested", + "indexmap", "log", + "recursive", + "regex", "sqlparser", ] [[package]] name = "datafusion-substrait" -version = "20.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "71a9cc3e51c77ada10990e996c15c5bda293b3e4a508f55c2f3993ea6c6a39a7" +version = "53.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=35749607f585b3bf25b66b7d2289c56c18d03e4f#35749607f585b3bf25b66b7d2289c56c18d03e4f" dependencies = [ "async-recursion", + "async-trait", "chrono", "datafusion", + "half", "itertools", "object_store", - "prost 0.11.8", - "prost-build 0.9.0", - "prost-types 0.11.8", + "pbjson-types", + "prost", "substrait", "tokio", + "url", ] [[package]] name = "digest" -version = "0.10.6" +version = "0.10.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"8168378f4e5023e7218c89c891c0fd8ecdb5e5e4f18cb78f38cf245dd021e76f" +checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" dependencies = [ "block-buffer", "crypto-common", @@ -889,73 +1678,67 @@ dependencies = [ ] [[package]] -name = "doc-comment" -version = "0.3.3" +name = "displaydoc" +version = "0.2.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fea41bba32d969b513997752735605054bc0dfa92b4c56bf1189f2e174be7a10" +checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] [[package]] name = "dyn-clone" -version = "1.0.11" +version = "1.0.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "68b0cf012f1230e43cd00ebb729c6bb58707ecfa8ad08b52ef3a4ccd2697fc30" +checksum = "d0881ea181b1df73ff77ffaaf9c7544ecc11e82fba9b5f27b262a3c73a332555" [[package]] name = "either" -version = "1.8.1" +version = "1.15.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7fcaabb2fef8c910e7f4c7ce9f67a1283a1715879a7c230ca9d6d1ae31f16d91" +checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" [[package]] -name = "encoding_rs" -version = "0.8.32" +name = "equivalent" +version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "071a31f4ee85403370b58aca746f01041ede6f0da2730960ad001edc2b71b394" -dependencies = [ - "cfg-if", -] +checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" [[package]] name = "errno" -version = "0.2.8" +version = "0.3.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f639046355ee4f37944e44f60642c6f3a7efa3cf6b78c78a0d989a8ce6c396a1" +checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" dependencies = [ - "errno-dragonfly", "libc", - "winapi", + "windows-sys 0.61.2", ] [[package]] -name = "errno-dragonfly" -version = "0.1.2" +name = 
"fastrand" +version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aa68f1b12764fab894d2755d2518754e71b4fd80ecfb822714a1206c2aab39bf" -dependencies = [ - "cc", - "libc", -] +checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" [[package]] -name = "fastrand" -version = "1.9.0" +name = "find-msvc-tools" +version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e51093e27b0797c359783294ca4f0a911c270184cb10f85783b118614a1501be" -dependencies = [ - "instant", -] +checksum = "5baebc0774151f905a1a2cc41989300b1e6fbb29aff0ceffa1064fdd3088d582" [[package]] name = "fixedbitset" -version = "0.4.2" +version = "0.5.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ce7134b9999ecaf8bcd65542e436736ef32ddca1b3e06094cb6ec5755203b80" +checksum = "1d674e81391d1e1ab681a28d99df07927c6d4aa5b027d7da16ba32d1d21ecd99" [[package]] name = "flatbuffers" -version = "23.1.21" +version = "25.12.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "77f5399c2c9c50ae9418e522842ad362f61ee48b346ac106807bd355a8a7c619" +checksum = "35f6839d7b3b98adde531effaf34f0c2badc6f4735d26fe74709d8e513a96ef3" dependencies = [ "bitflags", "rustc_version", @@ -963,12 +1746,13 @@ dependencies = [ [[package]] name = "flate2" -version = "1.0.25" +version = "1.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a8a2db397cb1c8772f31494cb8917e48cd1e64f0fa7efac59fbd741a0a8ce841" +checksum = "843fba2746e448b37e26a819579957415c8cef339bf08564fe8b7ddbd959573c" dependencies = [ "crc32fast", "miniz_oxide", + "zlib-rs", ] [[package]] @@ -977,20 +1761,32 @@ version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" +[[package]] +name = "foldhash" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2" + +[[package]] +name = "foldhash" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77ce24cb58228fbb8aa041425bb1050850ac19177686ea6e0f41a70416f56fdb" + [[package]] name = "form_urlencoded" -version = "1.1.0" +version = "1.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a9c384f161156f5260c24a097c56119f9be8c798586aecc13afbcbe7b7e26bf8" +checksum = "cb4cb245038516f5f85277875cdaa4f7d2c9a0fa0468de06ed190163b1581fcf" dependencies = [ "percent-encoding", ] [[package]] name = "futures" -version = "0.3.27" +version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "531ac96c6ff5fd7c62263c5e3c67a603af4fcaee2e1a0ae5565ba3a11e69e549" +checksum = "8b147ee9d1f6d097cef9ce628cd2ee62288d963e16fb287bd9286455b241382d" dependencies = [ "futures-channel", "futures-core", @@ -1003,9 +1799,9 @@ dependencies = [ [[package]] name = "futures-channel" -version = "0.3.27" +version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "164713a5a0dcc3e7b4b1ed7d3b433cabc18025386f9339346e8daf15963cf7ac" +checksum = "07bbe89c50d7a535e539b8c17bc0b49bdb77747034daa8087407d655f3f7cc1d" dependencies = [ "futures-core", "futures-sink", @@ -1013,15 +1809,15 @@ dependencies = [ [[package]] name = "futures-core" -version = "0.3.27" +version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "86d7a0c1aa76363dac491de0ee99faf6941128376f1cf96f07db7603b7de69dd" +checksum = "7e3450815272ef58cec6d564423f6e755e25379b217b0bc688e295ba24df6b1d" [[package]] name = "futures-executor" -version = "0.3.27" +version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1997dd9df74cdac935c76252744c1ed5794fac083242ea4fe77ef3ed60ba0f83" +checksum = "baf29c38818342a3b26b5b923639e7b1f4a61fc5e76102d4b1981c6dc7a7579d" dependencies = [ 
"futures-core", "futures-task", @@ -1030,38 +1826,38 @@ dependencies = [ [[package]] name = "futures-io" -version = "0.3.27" +version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89d422fa3cbe3b40dca574ab087abb5bc98258ea57eea3fd6f1fa7162c778b91" +checksum = "cecba35d7ad927e23624b22ad55235f2239cfa44fd10428eecbeba6d6a717718" [[package]] name = "futures-macro" -version = "0.3.27" +version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3eb14ed937631bd8b8b8977f2c198443447a8355b6e3ca599f38c975e5a963b6" +checksum = "e835b70203e41293343137df5c0664546da5745f82ec9b84d40be8336958447b" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.117", ] [[package]] name = "futures-sink" -version = "0.3.27" +version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec93083a4aecafb2a80a885c9de1f0ccae9dbd32c2bb54b0c3a65690e0b8d2f2" +checksum = "c39754e157331b013978ec91992bde1ac089843443c49cbc7f46150b0fad0893" [[package]] name = "futures-task" -version = "0.3.27" +version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fd65540d33b37b16542a0438c12e6aeead10d4ac5d05bd3f805b8f35ab592879" +checksum = "037711b3d59c33004d3856fbdc83b99d4ff37a24768fa1be9ce3538a1cde4393" [[package]] name = "futures-util" -version = "0.3.27" +version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3ef6b17e481503ec85211fed8f39d1970f128935ca1f814cd32ac4a6842e84ab" +checksum = "389ca41296e6190b48053de0321d02a77f32f8a5d2461dd38762c0593805c6d6" dependencies = [ "futures-channel", "futures-core", @@ -1071,15 +1867,23 @@ dependencies = [ "futures-task", "memchr", "pin-project-lite", - "pin-utils", "slab", ] +[[package]] +name = "generational-arena" +version = "0.2.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "877e94aff08e743b651baaea359664321055749b398adff8740a7399af7796e7" 
+dependencies = [ + "cfg-if", +] + [[package]] name = "generic-array" -version = "0.14.6" +version = "0.14.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bff49e947297f3312447abdca79f45f4738097cc82b06e72054d2223f601f1b9" +checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" dependencies = [ "typenum", "version_check", @@ -1087,32 +1891,61 @@ dependencies = [ [[package]] name = "getrandom" -version = "0.2.8" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff2abc00be7fca6ebc474524697ae276ad847ad0a6b3faa4bcb027e9a4614ad0" +dependencies = [ + "cfg-if", + "js-sys", + "libc", + "wasi", + "wasm-bindgen", +] + +[[package]] +name = "getrandom" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd" +dependencies = [ + "cfg-if", + "js-sys", + "libc", + "r-efi", + "wasip2", + "wasm-bindgen", +] + +[[package]] +name = "getrandom" +version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c05aeb6a22b8f62540c194aac980f2115af067bfe15a0734d7277a768d396b31" +checksum = "139ef39800118c7683f2fd3c98c1b23c09ae076556b435f8e9064ae108aaeeec" dependencies = [ "cfg-if", "libc", - "wasi 0.11.0+wasi-snapshot-preview1", + "r-efi", + "wasip2", + "wasip3", ] [[package]] name = "glob" -version = "0.3.1" +version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" +checksum = "0cc23270f6e1808e30a928bdc84dea0b9b4136a8bc82338574f23baf47bbd280" [[package]] name = "h2" -version = "0.3.16" +version = "0.4.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5be7b54589b581f624f566bf5d8eb2bab1db736c51528720b6bd36b96b55924d" +checksum = "2f44da3a8150a6703ed5d34e164b875fd14c2cdab9af1252a9a1020bde2bdc54" dependencies = [ + 
"atomic-waker", "bytes", "fnv", "futures-core", "futures-sink", - "futures-util", "http", "indexmap", "slab", @@ -1123,258 +1956,379 @@ dependencies = [ [[package]] name = "half" -version = "2.2.1" +version = "2.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "02b4af3693f1b705df946e9fe5631932443781d0aabb423b62fcd4d73f6d2fd0" +checksum = "6ea2d84b969582b4b1864a92dc5d27cd2b77b622a8d79306834f1be5ba20d84b" dependencies = [ + "cfg-if", "crunchy", "num-traits", + "zerocopy", ] [[package]] name = "hashbrown" -version = "0.12.3" +version = "0.14.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888" +checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" [[package]] name = "hashbrown" -version = "0.13.2" +version = "0.15.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "43a3c133739dddd0d2990f9a4bdf8eb4b21ef50e4851ca85ab661199821d510e" +checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1" dependencies = [ - "ahash", + "foldhash 0.1.5", ] [[package]] -name = "heck" -version = "0.3.3" +name = "hashbrown" +version = "0.16.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6d621efb26863f0e9924c6ac577e8275e5e6b77455db64ffa6c65c904e9e132c" +checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100" dependencies = [ - "unicode-segmentation", + "allocator-api2", + "equivalent", + "foldhash 0.2.0", ] [[package]] name = "heck" -version = "0.4.1" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" [[package]] -name = "hermit-abi" -version = "0.2.6" +name = "hex" +version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "ee512640fe35acbfb4bb779db6f0d80704c2cacfa2e39b601ef3e3f47d1ae4c7" -dependencies = [ - "libc", -] +checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" [[package]] -name = "home" -version = "0.5.4" +name = "http" +version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "747309b4b440c06d57b0b25f2aee03ee9b5e5397d288c60e21fc709bb98a7408" +checksum = "e3ba2a386d7f85a81f119ad7498ebe444d2e22c2af0b86b069416ace48b3311a" dependencies = [ - "winapi", + "bytes", + "itoa", ] [[package]] -name = "http" -version = "0.2.9" +name = "http-body" +version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bd6effc99afb63425aff9b05836f029929e345a6148a14b7ecd5ab67af944482" +checksum = "1efedce1fb8e6913f23e0c92de8e62cd5b772a67e7b3946df930a62566c93184" dependencies = [ "bytes", - "fnv", - "itoa", + "http", ] [[package]] -name = "http-body" -version = "0.4.5" +name = "http-body-util" +version = "0.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d5f38f16d184e36f2408a55281cd658ecbd3ca05cce6d6510a176eca393e26d1" +checksum = "b021d93e26becf5dc7e1b75b1bed1fd93124b374ceb73f43d4d4eafec896a64a" dependencies = [ "bytes", + "futures-core", "http", + "http-body", "pin-project-lite", ] [[package]] name = "httparse" -version = "1.8.0" +version = "1.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d897f394bad6a705d5f4104762e116a75639e470d80901eed05a860a95cb1904" +checksum = "6dbf3de79e51f3d586ab4cb9d5c3e2c14aa28ed23d180cf89b4df0454a69cc87" [[package]] -name = "httpdate" -version = "1.0.2" +name = "humantime" +version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c4a1e36c821dbe04574f602848a19f742f4fb3c98d40449f11bcad18d6b17421" +checksum = "135b12329e5e3ce057a9f972339ea52bc954fe1e9358ef27f95e89716fbc5424" [[package]] name = "hyper" -version = "0.14.25" +version = "1.8.1" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "cc5e554ff619822309ffd57d8734d77cd5ce6238bc956f037ea06c58238c9899" +checksum = "2ab2d4f250c3d7b1c9fcdff1cece94ea4e2dfbec68614f7b87cb205f24ca9d11" dependencies = [ + "atomic-waker", "bytes", "futures-channel", "futures-core", - "futures-util", "h2", "http", "http-body", "httparse", - "httpdate", "itoa", "pin-project-lite", - "socket2", + "pin-utils", + "smallvec", "tokio", - "tower-service", - "tracing", "want", ] [[package]] name = "hyper-rustls" -version = "0.23.2" +version = "0.27.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1788965e61b367cd03a62950836d5cd41560c3577d90e40e0819373194d1661c" +checksum = "e3c93eb611681b207e1fe55d5a71ecf91572ec8a6705cdb6857f7d8d5242cf58" dependencies = [ "http", "hyper", + "hyper-util", "rustls", + "rustls-native-certs", + "rustls-pki-types", "tokio", "tokio-rustls", + "tower-service", +] + +[[package]] +name = "hyper-util" +version = "0.1.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "96547c2556ec9d12fb1578c4eaf448b04993e7fb79cbaad930a656880a6bdfa0" +dependencies = [ + "base64", + "bytes", + "futures-channel", + "futures-util", + "http", + "http-body", + "hyper", + "ipnet", + "libc", + "percent-encoding", + "pin-project-lite", + "socket2", + "tokio", + "tower-service", + "tracing", ] [[package]] name = "iana-time-zone" -version = "0.1.53" +version = "0.1.65" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "64c122667b287044802d6ce17ee2ddf13207ed924c712de9a66a5814d5b64765" +checksum = "e31bc9ad994ba00e440a8aa5c9ef0ec67d5cb5e5cb0cc7f8b744a35b389cc470" dependencies = [ "android_system_properties", "core-foundation-sys", "iana-time-zone-haiku", "js-sys", + "log", "wasm-bindgen", - "winapi", + "windows-core", ] [[package]] name = "iana-time-zone-haiku" -version = "0.1.1" +version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"0703ae284fc167426161c2e3f1da3ea71d94b21bedbcc9494e92b28e334e3dca" +checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f" dependencies = [ - "cxx", - "cxx-build", + "cc", ] [[package]] -name = "idna" -version = "0.3.0" +name = "icu_collections" +version = "2.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e14ddfc70884202db2244c223200c204c2bda1bc6e0998d11b5e024d657209e6" +checksum = "4c6b649701667bbe825c3b7e6388cb521c23d88644678e83c0c4d0a621a34b43" dependencies = [ - "unicode-bidi", - "unicode-normalization", + "displaydoc", + "potential_utf", + "yoke", + "zerofrom", + "zerovec", ] [[package]] -name = "indexmap" -version = "1.9.2" +name = "icu_locale_core" +version = "2.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1885e79c1fc4b10f0e172c475f458b7f7b93061064d98c3293e98c5ba0c8b399" +checksum = "edba7861004dd3714265b4db54a3c390e880ab658fec5f7db895fae2046b5bb6" dependencies = [ - "autocfg", - "hashbrown 0.12.3", + "displaydoc", + "litemap", + "tinystr", + "writeable", + "zerovec", +] + +[[package]] +name = "icu_normalizer" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f6c8828b67bf8908d82127b2054ea1b4427ff0230ee9141c54251934ab1b599" +dependencies = [ + "icu_collections", + "icu_normalizer_data", + "icu_properties", + "icu_provider", + "smallvec", + "zerovec", ] [[package]] -name = "indoc" -version = "1.0.9" +name = "icu_normalizer_data" +version = "2.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bfa799dd5ed20a7e349f3b4639aa80d74549c81716d9ec4f994c9b5815598306" +checksum = "7aedcccd01fc5fe81e6b489c15b247b8b0690feb23304303a9e560f37efc560a" [[package]] -name = "instant" -version = "0.1.12" +name = "icu_properties" +version = "2.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a5bbe824c507c5da5956355e86a746d82e0e1464f65d862cc5e71da70e94b2c" +checksum = 
"020bfc02fe870ec3a66d93e677ccca0562506e5872c650f893269e08615d74ec" dependencies = [ - "cfg-if", + "icu_collections", + "icu_locale_core", + "icu_properties_data", + "icu_provider", + "zerotrie", + "zerovec", ] [[package]] -name = "integer-encoding" -version = "3.0.4" +name = "icu_properties_data" +version = "2.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8bb03732005da905c88227371639bf1ad885cc712789c011c31c5fb3ab3ccf02" +checksum = "616c294cf8d725c6afcd8f55abc17c56464ef6211f9ed59cccffe534129c77af" [[package]] -name = "io-lifetimes" -version = "1.0.6" +name = "icu_provider" +version = "2.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cfa919a82ea574332e2de6e74b4c36e74d41982b335080fa59d4ef31be20fdf3" +checksum = "85962cf0ce02e1e0a629cc34e7ca3e373ce20dda4c4d7294bbd0bf1fdb59e614" dependencies = [ - "libc", - "windows-sys 0.45.0", + "displaydoc", + "icu_locale_core", + "writeable", + "yoke", + "zerofrom", + "zerotrie", + "zerovec", +] + +[[package]] +name = "id-arena" +version = "2.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d3067d79b975e8844ca9eb072e16b31c3c1c36928edf9c6789548c524d0d954" + +[[package]] +name = "ident_case" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39" + +[[package]] +name = "idna" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b0875f23caa03898994f6ddc501886a45c7d3d62d04d2d90788d47be1b1e4de" +dependencies = [ + "idna_adapter", + "smallvec", + "utf8_iter", +] + +[[package]] +name = "idna_adapter" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3acae9609540aa318d1bc588455225fb2085b9ed0c4f6bd0d9d5bcd86f1a0344" +dependencies = [ + "icu_normalizer", + "icu_properties", +] + +[[package]] +name = "indexmap" +version = "2.13.0" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7714e70437a7dc3ac8eb7e6f8df75fd8eb422675fc7678aff7364301092b1017" +dependencies = [ + "equivalent", + "hashbrown 0.16.1", + "serde", + "serde_core", ] +[[package]] +name = "integer-encoding" +version = "3.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8bb03732005da905c88227371639bf1ad885cc712789c011c31c5fb3ab3ccf02" + [[package]] name = "ipnet" -version = "2.7.1" +version = "2.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "30e22bd8629359895450b59ea7a776c850561b96a3b1d31321c1949d9e6c9146" +checksum = "469fb0b9cefa57e3ef31275ee7cacb78f2fdca44e4765491884a2b119d4eb130" + +[[package]] +name = "iri-string" +version = "0.7.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c91338f0783edbd6195decb37bae672fd3b165faffb89bf7b9e6942f8b1a731a" +dependencies = [ + "memchr", + "serde", +] [[package]] name = "itertools" -version = "0.10.5" +version = "0.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b0fd2260e829bddf4cb6ea802289de2f86d6a7a690192fbe91b3f46e0f2c8473" +checksum = "2b192c782037fadd9cfa75548310488aabdbf3d2da73885b31bd0abd03351285" dependencies = [ "either", ] [[package]] name = "itoa" -version = "1.0.6" +version = "1.0.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "453ad9f582a441959e5f0d088b02ce04cfe8d51a8eaf077f12ac6d3e94164ca6" +checksum = "92ecc6618181def0457392ccd0ee51198e065e016d1d527a7ac1b6dc7c1f09d2" [[package]] name = "jobserver" -version = "0.1.26" +version = "0.1.34" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "936cfd212a0155903bcbc060e316fb6cc7cbf2e1907329391ebadc1fe0ce77c2" +checksum = "9afb3de4395d6b3e67a780b6de64b51c978ecf11cb9a462c66be7d4ca9039d33" dependencies = [ + "getrandom 0.3.4", "libc", ] [[package]] name = "js-sys" -version = "0.3.61" +version = "0.3.85" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "445dde2150c55e483f3d8416706b97ec8e8237c307e5b7b4b8dd15e6af2a0730" +checksum = "8c942ebf8e95485ca0d52d97da7c5a2c387d0e7f0ba4c35e93bfcaee045955b3" dependencies = [ + "once_cell", "wasm-bindgen", ] [[package]] -name = "lazy_static" -version = "1.4.0" +name = "leb128fmt" +version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" +checksum = "09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2" [[package]] name = "lexical-core" -version = "0.8.5" +version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2cde5de06e8d4c2faabc400238f9ae1c74d5412d03a7bd067645ccbc47070e46" +checksum = "7d8d125a277f807e55a77304455eb7b1cb52f2b18c143b60e766c120bd64a594" dependencies = [ "lexical-parse-float", "lexical-parse-integer", @@ -1385,363 +2339,312 @@ dependencies = [ [[package]] name = "lexical-parse-float" -version = "0.8.5" +version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "683b3a5ebd0130b8fb52ba0bdc718cc56815b6a097e28ae5a6997d0ad17dc05f" +checksum = "52a9f232fbd6f550bc0137dcb5f99ab674071ac2d690ac69704593cb4abbea56" dependencies = [ "lexical-parse-integer", "lexical-util", - "static_assertions", ] [[package]] name = "lexical-parse-integer" -version = "0.8.6" +version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6d0994485ed0c312f6d965766754ea177d07f9c00c9b82a5ee62ed5b47945ee9" +checksum = "9a7a039f8fb9c19c996cd7b2fcce303c1b2874fe1aca544edc85c4a5f8489b34" dependencies = [ "lexical-util", - "static_assertions", ] [[package]] name = "lexical-util" -version = "0.8.5" +version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5255b9ff16ff898710eb9eb63cb39248ea8a5bb036bea8085b1a767ff6c4e3fc" -dependencies = [ - "static_assertions", -] +checksum 
= "2604dd126bb14f13fb5d1bd6a66155079cb9fa655b37f875b3a742c705dbed17" [[package]] name = "lexical-write-float" -version = "0.8.5" +version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "accabaa1c4581f05a3923d1b4cfd124c329352288b7b9da09e766b0668116862" +checksum = "50c438c87c013188d415fbabbb1dceb44249ab81664efbd31b14ae55dabb6361" dependencies = [ "lexical-util", "lexical-write-integer", - "static_assertions", ] [[package]] name = "lexical-write-integer" -version = "0.8.5" +version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e1b6f3d1f4422866b68192d62f77bc5c700bee84f3069f2469d7bc8c77852446" +checksum = "409851a618475d2d5796377cad353802345cba92c867d9fbcde9cf4eac4e14df" dependencies = [ "lexical-util", - "static_assertions", ] [[package]] -name = "libc" -version = "0.2.140" +name = "libbz2-rs-sys" +version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "99227334921fae1a979cf0bfdfcc6b3e5ce376ef57e16fb6fb3ea2ed6095f80c" +checksum = "2c4a545a15244c7d945065b5d392b2d2d7f21526fba56ce51467b06ed445e8f7" [[package]] -name = "libflate" -version = "1.3.0" +name = "libc" +version = "0.2.182" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "97822bf791bd4d5b403713886a5fbe8bf49520fe78e323b0dc480ca1a03e50b0" -dependencies = [ - "adler32", - "crc32fast", - "libflate_lz77", -] +checksum = "6800badb6cb2082ffd7b6a67e6125bb39f18782f793520caee8cb8846be06112" [[package]] -name = "libflate_lz77" -version = "1.2.0" +name = "libloading" +version = "0.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a52d3a8bfc85f250440e4424db7d857e241a3aebbbe301f3eb606ab15c39acbf" +checksum = "b67380fd3b2fbe7527a606e18729d21c6f3951633d0500574c4dc22d2d638b9f" dependencies = [ - "rle-decode-fast", + "cfg-if", + "winapi", ] [[package]] -name = "libm" -version = "0.2.6" +name = "liblzma" +version = "0.4.6" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "348108ab3fba42ec82ff6e9564fc4ca0247bdccdc68dd8af9764bbc79c3c8ffb" +checksum = "b6033b77c21d1f56deeae8014eb9fbe7bdf1765185a6c508b5ca82eeaed7f899" +dependencies = [ + "liblzma-sys", +] [[package]] -name = "libmimalloc-sys" -version = "0.1.30" +name = "liblzma-sys" +version = "0.4.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dd8c7cbf8b89019683667e347572e6d55a7df7ea36b0c4ce69961b0cde67b174" +checksum = "9f2db66f3268487b5033077f266da6777d057949b8f93c8ad82e441df25e6186" dependencies = [ "cc", "libc", + "pkg-config", ] [[package]] -name = "link-cplusplus" -version = "1.0.8" +name = "libm" +version = "0.2.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6d2cec3eae94f9f509c767b45932f1ada8350c4bdb85af2fcab4a3c14807981" + +[[package]] +name = "libmimalloc-sys" +version = "0.1.44" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ecd207c9c713c34f95a097a5b029ac2ce6010530c7b49d7fea24d977dede04f5" +checksum = "667f4fec20f29dfc6bc7357c582d91796c169ad7e2fce709468aefeb2c099870" dependencies = [ "cc", + "libc", ] [[package]] name = "linux-raw-sys" -version = "0.1.4" +version = "0.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f051f77a7c8e6957c0696eac88f26b0117e54f52d3fc682ab19397a8812846a4" +checksum = "df1d3c3b53da64cf5760482273a98e575c651a67eec7f77df96b5b642de8f039" [[package]] -name = "lock_api" -version = "0.4.9" +name = "litemap" +version = "0.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "435011366fe56583b16cf956f9df0095b405b82d76425bc8981c0e22e60ec4df" -dependencies = [ - "autocfg", - "scopeguard", -] +checksum = "6373607a59f0be73a39b6fe456b8192fcc3585f602af20751600e974dd455e77" [[package]] -name = "log" -version = "0.4.17" +name = "lock_api" +version = "0.4.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"abb12e687cfb44aa40f41fc3978ef76448f9b6038cad6aef4259d3c095a2382e" +checksum = "224399e74b87b5f3557511d98dff8b14089b3dadafcab6bb93eab67d3aace965" dependencies = [ - "cfg-if", + "scopeguard", ] [[package]] -name = "lz4" -version = "1.24.0" +name = "log" +version = "0.4.29" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7e9e2dd86df36ce760a60f6ff6ad526f7ba1f14ba0356f8254fb6905e6494df1" -dependencies = [ - "libc", - "lz4-sys", -] +checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897" [[package]] -name = "lz4-sys" -version = "1.9.4" +name = "lru-slab" +version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "57d27b317e207b10f69f5e75494119e391a96f48861ae870d1da6edac98ca900" -dependencies = [ - "cc", - "libc", -] +checksum = "112b39cec0b298b6c1999fee3e31427f74f676e4cb9879ed1a121b43661a4154" [[package]] -name = "lzma-sys" -version = "0.1.20" +name = "lz4_flex" +version = "0.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5fda04ab3764e6cde78b9974eec4f779acaba7c4e84b36eca3cf77c581b85d27" +checksum = "ab6473172471198271ff72e9379150e9dfd70d8e533e0752a27e515b48dd375e" dependencies = [ - "cc", - "libc", - "pkg-config", + "twox-hash", ] [[package]] name = "md-5" -version = "0.10.5" +version = "0.10.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6365506850d44bff6e2fbcb5176cf63650e48bd45ef2fe2665ae1570e0f4b9ca" +checksum = "d89e7ee0cfbedfc4da3340218492196241d89eefb6dab27de5df917a6d2e78cf" dependencies = [ + "cfg-if", "digest", ] [[package]] name = "memchr" -version = "2.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d" - -[[package]] -name = "memoffset" -version = "0.8.0" +version = "2.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"d61c719bcfbcf5d62b3a09efa6088de8c54bc0bfcd3ea7ae39fcc186108b8de1" -dependencies = [ - "autocfg", -] +checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79" [[package]] name = "mimalloc" -version = "0.1.34" +version = "0.1.48" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9dcb174b18635f7561a0c6c9fc2ce57218ac7523cf72c50af80e2d79ab8f3ba1" +checksum = "e1ee66a4b64c74f4ef288bcbb9192ad9c3feaad75193129ac8509af543894fd8" dependencies = [ "libmimalloc-sys", ] -[[package]] -name = "mime" -version = "0.3.16" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2a60c7ce501c71e03a9c9c0d35b861413ae925bd979cc7a4e30d060069aaac8d" - [[package]] name = "miniz_oxide" -version = "0.6.2" +version = "0.8.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b275950c28b37e794e8c55d88aeb5e139d0ce23fdbbeda68f8d7174abdf9e8fa" +checksum = "1fa76a2c86f704bdb222d66965fb3d63269ce38518b83cb0575fca855ebb6316" dependencies = [ - "adler", + "adler2", + "simd-adler32", ] [[package]] name = "mio" -version = "0.8.6" +version = "1.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b9d9a46eff5b4ff64b45a9e316a6d1e0bc719ef429cbec4dc630684212bfdf9" +checksum = "a69bcab0ad47271a0234d9422b131806bf3968021e5dc9328caf2d4cd58557fc" dependencies = [ "libc", - "log", - "wasi 0.11.0+wasi-snapshot-preview1", - "windows-sys 0.45.0", + "wasi", + "windows-sys 0.61.2", ] [[package]] name = "multimap" -version = "0.8.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a" - -[[package]] -name = "num" -version = "0.4.0" +version = "0.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "43db66d1170d347f9a065114077f7dccb00c1b9478c89384490a3425279a4606" -dependencies = [ - "num-bigint", - "num-complex", - "num-integer", - "num-iter", - "num-rational", - 
"num-traits", -] +checksum = "1d87ecb2933e8aeadb3e3a02b828fed80a7528047e68b4f424523a0981a3a084" [[package]] name = "num-bigint" -version = "0.4.3" +version = "0.4.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f93ab6289c7b344a8a9f60f88d80aa20032336fe78da341afc91c8a2341fc75f" +checksum = "a5e44f723f1133c9deac646763579fdb3ac745e418f2a7af9cd0c431da1f20b9" dependencies = [ - "autocfg", "num-integer", "num-traits", + "serde", ] [[package]] name = "num-complex" -version = "0.4.3" +version = "0.4.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "02e0d21255c828d6f128a1e41534206671e8c3ea0c62f32291e808dc82cff17d" +checksum = "73f88a1307638156682bada9d7604135552957b7818057dcef22705b4d509495" dependencies = [ "num-traits", ] [[package]] name = "num-integer" -version = "0.1.45" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "225d3389fb3509a24c93f5c29eb6bde2586b98d9f016636dff58d7c6f7569cd9" -dependencies = [ - "autocfg", - "num-traits", -] - -[[package]] -name = "num-iter" -version = "0.1.43" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7d03e6c028c5dc5cac6e2dec0efda81fc887605bb3d884578bb6d6bf7514e252" -dependencies = [ - "autocfg", - "num-integer", - "num-traits", -] - -[[package]] -name = "num-rational" -version = "0.4.1" +version = "0.1.46" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0638a1c9d0a3c0914158145bc76cff373a75a627e6ecbfb71cbe6f453a5a19b0" +checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f" dependencies = [ - "autocfg", - "num-bigint", - "num-integer", "num-traits", ] [[package]] name = "num-traits" -version = "0.2.15" +version = "0.2.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "578ede34cf02f8924ab9447f50c28075b4d3e5b269972345e7e0372b38c6cdcd" +checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" dependencies = [ 
"autocfg", "libm", ] [[package]] -name = "num_cpus" -version = "1.15.0" +name = "object" +version = "0.37.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0fac9e2da13b5eb447a6ce3d392f23a29d8694bff781bf03a16cd9ac8697593b" +checksum = "ff76201f031d8863c38aa7f905eca4f53abbfa15f609db4277d44cd8938f33fe" dependencies = [ - "hermit-abi", - "libc", + "memchr", ] [[package]] name = "object_store" -version = "0.5.5" +version = "0.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e1ea8f683b4f89a64181393742c041520a1a87e9775e6b4c0dd5a3281af05fc6" +checksum = "c2858065e55c148d294a9f3aae3b0fa9458edadb41a108397094566f4e3c0dfb" dependencies = [ "async-trait", "base64", "bytes", "chrono", + "form_urlencoded", "futures", + "http", + "http-body-util", + "httparse", + "humantime", + "hyper", "itertools", + "md-5", "parking_lot", "percent-encoding", "quick-xml", "rand", "reqwest", "ring", - "rustls-pemfile", + "rustls-pki-types", "serde", "serde_json", - "snafu", + "serde_urlencoded", + "thiserror", "tokio", "tracing", "url", "walkdir", + "wasm-bindgen-futures", + "web-time", ] [[package]] name = "once_cell" -version = "1.17.1" +version = "1.21.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" + +[[package]] +name = "openssl-probe" +version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b7e5500299e16ebb147ae15a00a942af264cf3688f47923b8fc2cd5858f23ad3" +checksum = "7c87def4c32ab89d880effc9e097653c8da5d6ef28e6b539d313baaacfbafcbe" [[package]] name = "ordered-float" -version = "2.10.0" +version = "2.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7940cf2ca942593318d07fcf2596cdca60a85c9e7fab408a5e21a4f9dcd40d87" +checksum = "68f19d67e5a2795c94e73e0bb1cc1a7edeb2e28efd39e2e1c9b7a40c1108b11c" dependencies = [ "num-traits", ] [[package]] name = "parking_lot" 
-version = "0.12.1" +version = "0.12.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3742b2c103b9f06bc9fff0a37ff4912935851bee6d36f3c02bcc755bcfec228f" +checksum = "93857453250e3077bd71ff98b6a65ea6621a19bb0f559a85248955ac12c45a1a" dependencies = [ "lock_api", "parking_lot_core", @@ -1749,27 +2652,26 @@ dependencies = [ [[package]] name = "parking_lot_core" -version = "0.9.7" +version = "0.9.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9069cbb9f99e3a5083476ccb29ceb1de18b9118cafa53e90c9551235de2b9521" +checksum = "2621685985a2ebf1c516881c026032ac7deafcda1a2c9b7850dc81e3dfcb64c1" dependencies = [ "cfg-if", "libc", "redox_syscall", "smallvec", - "windows-sys 0.45.0", + "windows-link", ] [[package]] name = "parquet" -version = "34.0.0" +version = "58.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7ac135ecf63ebb5f53dda0921b0b76d6048b3ef631a5f4760b9e8f863ff00cfa" +checksum = "3f491d0ef1b510194426ee67ddc18a9b747ef3c42050c19322a2cd2e1666c29b" dependencies = [ "ahash", "arrow-array", "arrow-buffer", - "arrow-cast", "arrow-data", "arrow-ipc", "arrow-schema", @@ -1780,56 +2682,107 @@ dependencies = [ "chrono", "flate2", "futures", - "hashbrown 0.13.2", - "lz4", - "num", + "half", + "hashbrown 0.16.1", + "lz4_flex", "num-bigint", + "num-integer", + "num-traits", + "object_store", "paste", "seq-macro", + "simdutf8", "snap", "thrift", "tokio", "twox-hash", - "zstd 0.12.3+zstd.1.5.2", + "zstd", ] [[package]] name = "paste" -version = "1.0.12" +version = "1.0.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9f746c4065a8fa3fe23974dd82f15431cc8d40779821001404d10d2e79ca7d79" +checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" [[package]] -name = "percent-encoding" -version = "2.2.0" +name = "pbjson" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"898bac3fa00d0ba57a4e8289837e965baa2dee8c3749f3b11d45a64b4223d9c3" +dependencies = [ + "base64", + "serde", +] + +[[package]] +name = "pbjson-build" +version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "478c572c3d73181ff3c2539045f6eb99e5491218eae919370993b890cdbdd98e" +checksum = "af22d08a625a2213a78dbb0ffa253318c5c79ce3133d32d296655a7bdfb02095" +dependencies = [ + "heck", + "itertools", + "prost", + "prost-types", +] [[package]] -name = "pest" -version = "2.5.6" +name = "pbjson-types" +version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8cbd939b234e95d72bc393d51788aec68aeeb5d51e748ca08ff3aad58cb722f7" +checksum = "8e748e28374f10a330ee3bb9f29b828c0ac79831a32bab65015ad9b661ead526" dependencies = [ - "thiserror", - "ucd-trie", + "bytes", + "chrono", + "pbjson", + "pbjson-build", + "prost", + "prost-build", + "serde", ] +[[package]] +name = "percent-encoding" +version = "2.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b4f627cb1b25917193a259e49bdad08f671f8d9708acfd5fe0a8c1455d87220" + [[package]] name = "petgraph" -version = "0.6.3" +version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4dd7d28ee937e54fe3080c91faa1c3a46c06de6252988a7f4592ba2310ef22a4" +checksum = "8701b58ea97060d5e5b155d383a69952a60943f0e6dfe30b04c287beb0b27455" dependencies = [ "fixedbitset", + "hashbrown 0.15.5", "indexmap", + "serde", +] + +[[package]] +name = "phf" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "913273894cec178f401a31ec4b656318d95473527be05c0752cc41cdc32be8b7" +dependencies = [ + "phf_shared", +] + +[[package]] +name = "phf_shared" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06005508882fb681fd97892ecff4b7fd0fee13ef1aa569f8695dae7ab9099981" +dependencies = [ + "siphasher", ] [[package]] name = "pin-project-lite" 
-version = "0.2.9" +version = "0.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e0a7ae3ac2f1173085d398531c705756c94a4c56843785df85a60c1a0afac116" +checksum = "3b3cff922bd51709b605d9ead9aa71031d81447142d828eb4a6eba76fe619f9b" [[package]] name = "pin-utils" @@ -1839,237 +2792,307 @@ checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" [[package]] name = "pkg-config" -version = "0.3.26" +version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6ac9a59f73473f1b8d852421e59e64809f025994837ef743615c6d0c5b305160" +checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c" [[package]] -name = "ppv-lite86" -version = "0.2.17" +name = "portable-atomic" +version = "1.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" +checksum = "c33a9471896f1c69cecef8d20cbe2f7accd12527ce60845ff44c153bb2a21b49" [[package]] -name = "proc-macro-hack" -version = "0.5.20+deprecated" +name = "potential_utf" +version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc375e1527247fe1a97d8b7156678dfe7c1af2fc075c9a4db3690ecd2a148068" +checksum = "b73949432f5e2a09657003c25bca5e19a0e9c84f8058ca374f49e0ebe605af77" +dependencies = [ + "zerovec", +] [[package]] -name = "proc-macro2" -version = "1.0.52" +name = "ppv-lite86" +version = "0.2.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d0e1ae9e836cc3beddd63db0df682593d7e2d3d891ae8c9083d2113e1744224" +checksum = "85eae3c4ed2f50dcfe72643da4befc30deadb458a9b590d720cde2f2b1e97da9" dependencies = [ - "unicode-ident", + "zerocopy", ] [[package]] -name = "prost" -version = "0.9.0" +name = "prettyplease" +version = "0.2.37" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "444879275cb4fd84958b1a1d5420d15e6fcf7c235fe47f053c9c2a80aceb6001" +checksum = 
"479ca8adacdd7ce8f1fb39ce9ecccbfe93a3f1344b3d0d97f20bc0196208f62b" dependencies = [ - "bytes", - "prost-derive 0.9.0", + "proc-macro2", + "syn 2.0.117", ] [[package]] -name = "prost" -version = "0.11.8" +name = "proc-macro2" +version = "1.0.106" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e48e50df39172a3e7eb17e14642445da64996989bc212b583015435d39a58537" +checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934" dependencies = [ - "bytes", - "prost-derive 0.11.8", + "unicode-ident", ] [[package]] -name = "prost-build" -version = "0.9.0" +name = "prost" +version = "0.14.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "62941722fb675d463659e49c4f3fe1fe792ff24fe5bbaa9c08cd3b98a1c354f5" +checksum = "d2ea70524a2f82d518bce41317d0fae74151505651af45faf1ffbd6fd33f0568" dependencies = [ "bytes", - "heck 0.3.3", - "itertools", - "lazy_static", - "log", - "multimap", - "petgraph", - "prost 0.9.0", - "prost-types 0.9.0", - "regex", - "tempfile", - "which", + "prost-derive", ] [[package]] name = "prost-build" -version = "0.11.8" +version = "0.14.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c828f93f5ca4826f97fedcbd3f9a536c16b12cff3dbbb4a007f932bbad95b12" +checksum = "343d3bd7056eda839b03204e68deff7d1b13aba7af2b2fd16890697274262ee7" dependencies = [ - "bytes", - "heck 0.4.1", + "heck", "itertools", - "lazy_static", "log", "multimap", "petgraph", - "prost 0.11.8", - "prost-types 0.11.8", + "prettyplease", + "prost", + "prost-types", "regex", + "syn 2.0.117", "tempfile", - "which", ] [[package]] name = "prost-derive" -version = "0.9.0" +version = "0.14.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f9cc1a3263e07e0bf68e96268f37665207b49560d98739662cdfaae215c720fe" +checksum = "27c6023962132f4b30eb4c172c91ce92d933da334c59c23cddee82358ddafb0b" dependencies = [ "anyhow", "itertools", "proc-macro2", "quote", - "syn", + "syn 2.0.117", ] 
[[package]] -name = "prost-derive" -version = "0.11.8" +name = "prost-types" +version = "0.14.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4ea9b0f8cbe5e15a8a042d030bd96668db28ecb567ec37d691971ff5731d2b1b" +checksum = "8991c4cbdb8bc5b11f0b074ffe286c30e523de90fee5ba8132f1399f23cb3dd7" dependencies = [ - "anyhow", - "itertools", - "proc-macro2", - "quote", - "syn", + "prost", ] [[package]] -name = "prost-types" -version = "0.9.0" +name = "protobuf-src" +version = "2.1.1+27.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "534b7a0e836e3c482d2693070f982e39e7611da9695d4d1f5a4b186b51faef0a" +checksum = "6217c3504da19b85a3a4b2e9a5183d635822d83507ba0986624b5c05b83bfc40" dependencies = [ - "bytes", - "prost 0.9.0", + "cmake", ] [[package]] -name = "prost-types" -version = "0.11.8" +name = "psm" +version = "0.1.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "379119666929a1afd7a043aa6cf96fa67a6dce9af60c88095a4686dbce4c9c88" +checksum = "3852766467df634d74f0b2d7819bf8dc483a0eb2e3b0f50f756f9cfe8b0d18d8" dependencies = [ - "prost 0.11.8", + "ar_archive_writer", + "cc", ] [[package]] name = "pyo3" -version = "0.18.1" +version = "0.28.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "06a3d8e8a46ab2738109347433cb7b96dffda2e4a218b03ef27090238886b147" +checksum = "cf85e27e86080aafd5a22eae58a162e133a589551542b3e5cee4beb27e54f8e1" dependencies = [ - "cfg-if", - "indoc", "libc", - "memoffset", - "parking_lot", + "once_cell", + "portable-atomic", "pyo3-build-config", "pyo3-ffi", "pyo3-macros", - "unindent", ] [[package]] -name = "pyo3-build-config" -version = "0.18.1" +name = "pyo3-async-runtimes" +version = "0.28.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "75439f995d07ddfad42b192dfcf3bc66a7ecfd8b4a1f5f6f046aa5c2c5d7677d" +checksum = "9e7364a95bf00e8377bbf9b0f09d7ff9715a29d8fcf93b47d1a967363b973178" dependencies = [ + 
"futures-channel", + "futures-util", "once_cell", + "pin-project-lite", + "pyo3", + "tokio", +] + +[[package]] +name = "pyo3-build-config" +version = "0.28.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8bf94ee265674bf76c09fa430b0e99c26e319c945d96ca0d5a8215f31bf81cf7" +dependencies = [ "target-lexicon", ] [[package]] name = "pyo3-ffi" -version = "0.18.1" +version = "0.28.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "839526a5c07a17ff44823679b68add4a58004de00512a95b6c1c98a6dcac0ee5" +checksum = "491aa5fc66d8059dd44a75f4580a2962c1862a1c2945359db36f6c2818b748dc" dependencies = [ "libc", "pyo3-build-config", ] +[[package]] +name = "pyo3-log" +version = "0.13.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "26c2ec80932c5c3b2d4fbc578c9b56b2d4502098587edb8bef5b6bfcad43682e" +dependencies = [ + "arc-swap", + "log", + "pyo3", +] + [[package]] name = "pyo3-macros" -version = "0.18.1" +version = "0.28.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bd44cf207476c6a9760c4653559be4f206efafb924d3e4cbf2721475fc0d6cc5" +checksum = "f5d671734e9d7a43449f8480f8b38115df67bef8d21f76837fa75ee7aaa5e52e" dependencies = [ "proc-macro2", "pyo3-macros-backend", "quote", - "syn", + "syn 2.0.117", ] [[package]] name = "pyo3-macros-backend" -version = "0.18.1" +version = "0.28.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc1f43d8e30460f36350d18631ccf85ded64c059829208fe680904c65bcd0a4c" +checksum = "22faaa1ce6c430a1f71658760497291065e6450d7b5dc2bcf254d49f66ee700a" dependencies = [ + "heck", "proc-macro2", + "pyo3-build-config", "quote", - "syn", + "syn 2.0.117", ] [[package]] name = "quad-rand" -version = "0.2.1" +version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "658fa1faf7a4cc5f057c9ee5ef560f717ad9d8dc66d975267f709624d6e1ab88" +checksum = 
"5a651516ddc9168ebd67b24afd085a718be02f8858fe406591b013d101ce2f40" [[package]] name = "quick-xml" -version = "0.27.1" +version = "0.38.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ffc053f057dd768a56f62cd7e434c42c831d296968997e9ac1f76ea7c2d14c41" +checksum = "b66c2058c55a409d601666cffe35f04333cf1013010882cec174a7467cd4e21c" dependencies = [ "memchr", "serde", ] +[[package]] +name = "quinn" +version = "0.11.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9e20a958963c291dc322d98411f541009df2ced7b5a4f2bd52337638cfccf20" +dependencies = [ + "bytes", + "cfg_aliases", + "pin-project-lite", + "quinn-proto", + "quinn-udp", + "rustc-hash", + "rustls", + "socket2", + "thiserror", + "tokio", + "tracing", + "web-time", +] + +[[package]] +name = "quinn-proto" +version = "0.11.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1906b49b0c3bc04b5fe5d86a77925ae6524a19b816ae38ce1e426255f1d8a31" +dependencies = [ + "bytes", + "getrandom 0.3.4", + "lru-slab", + "rand", + "ring", + "rustc-hash", + "rustls", + "rustls-pki-types", + "slab", + "thiserror", + "tinyvec", + "tracing", + "web-time", +] + +[[package]] +name = "quinn-udp" +version = "0.5.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "addec6a0dcad8a8d96a771f815f0eaf55f9d1805756410b39f5fa81332574cbd" +dependencies = [ + "cfg_aliases", + "libc", + "once_cell", + "socket2", + "tracing", + "windows-sys 0.60.2", +] + [[package]] name = "quote" -version = "1.0.26" +version = "1.0.44" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4424af4bf778aae2051a77b60283332f386554255d722233d09fbfc7e30da2fc" +checksum = "21b2ebcf727b7760c461f091f9f0f539b77b8e87f2fd88131e7f1b433b3cece4" dependencies = [ "proc-macro2", ] +[[package]] +name = "r-efi" +version = "5.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f" + [[package]] name = "rand" -version = "0.8.5" +version = "0.9.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" +checksum = "6db2770f06117d490610c7488547d543617b21bfa07796d7a12f6f1bd53850d1" dependencies = [ - "libc", "rand_chacha", "rand_core", ] [[package]] name = "rand_chacha" -version = "0.3.1" +version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" +checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb" dependencies = [ "ppv-lite86", "rand_core", @@ -2077,178 +3100,238 @@ dependencies = [ [[package]] name = "rand_core" -version = "0.6.4" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76afc826de14238e6e8c374ddcc1fa19e374fd8dd986b0d2af0d02377261d83c" +dependencies = [ + "getrandom 0.3.4", +] + +[[package]] +name = "recursive" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0786a43debb760f491b1bc0269fe5e84155353c67482b9e60d0cfb596054b43e" +dependencies = [ + "recursive-proc-macro-impl", + "stacker", +] + +[[package]] +name = "recursive-proc-macro-impl" +version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" +checksum = "76009fbe0614077fc1a2ce255e3a1881a2e3a3527097d5dc6d8212c585e7e38b" dependencies = [ - "getrandom", + "quote", + "syn 2.0.117", ] [[package]] name = "redox_syscall" -version = "0.2.16" +version = "0.5.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed2bf2547551a7053d6fdfafda3f938979645c44812fbfcda098faae3f1a362d" +dependencies = [ + "bitflags", +] + +[[package]] +name = "regex" +version = "1.12.3" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "fb5a58c1855b4b6819d59012155603f0b22ad30cad752600aadfcb695265519a" +checksum = "e10754a14b9137dd7b1e3e5b0493cc9171fdd105e0ab477f51b72e7f3ac0e276" dependencies = [ - "bitflags", + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", ] [[package]] -name = "regex" -version = "1.7.1" +name = "regex-automata" +version = "0.4.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "48aaa5748ba571fb95cd2c85c09f629215d3a6ece942baa100950af03a34f733" +checksum = "6e1dd4122fc1595e8162618945476892eefca7b88c52820e74af6262213cae8f" dependencies = [ "aho-corasick", "memchr", "regex-syntax", ] +[[package]] +name = "regex-lite" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cab834c73d247e67f4fae452806d17d3c7501756d98c8808d7c9c7aa7d18f973" + [[package]] name = "regex-syntax" -version = "0.6.28" +version = "0.8.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "456c603be3e8d448b072f410900c09faf164fbce2d480456f50eea6e25f9c848" +checksum = "a96887878f22d7bad8a3b6dc5b7440e0ada9a245242924394987b21cf2210a4c" [[package]] name = "regress" -version = "0.4.1" +version = "0.10.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0a92ff21fe8026ce3f2627faaf43606f0b67b014dbc9ccf027181a804f75d92e" +checksum = "2057b2325e68a893284d1538021ab90279adac1139957ca2a74426c6f118fb48" dependencies = [ + "hashbrown 0.16.1", "memchr", ] +[[package]] +name = "repr_offset" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fb1070755bd29dffc19d0971cab794e607839ba2ef4b69a9e6fbc8733c1b72ea" +dependencies = [ + "tstr", +] + [[package]] name = "reqwest" -version = "0.11.14" +version = "0.12.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "21eed90ec8570952d53b772ecf8f206aa1ec9a3d76b2521c56c42973f2d91ee9" +checksum = 
"eddd3ca559203180a307f12d114c268abf583f59b03cb906fd0b3ff8646c1147" dependencies = [ "base64", "bytes", - "encoding_rs", "futures-core", "futures-util", "h2", "http", "http-body", + "http-body-util", "hyper", "hyper-rustls", - "ipnet", + "hyper-util", "js-sys", "log", - "mime", - "once_cell", "percent-encoding", "pin-project-lite", + "quinn", "rustls", - "rustls-pemfile", + "rustls-native-certs", + "rustls-pki-types", "serde", "serde_json", "serde_urlencoded", + "sync_wrapper", "tokio", "tokio-rustls", "tokio-util", + "tower", + "tower-http", "tower-service", "url", "wasm-bindgen", "wasm-bindgen-futures", "wasm-streams", "web-sys", - "webpki-roots", - "winreg", ] [[package]] name = "ring" -version = "0.16.20" +version = "0.17.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3053cf52e236a3ed746dfc745aa9cacf1b791d846bdaf412f60a8d7d6e17c8fc" +checksum = "a4689e6c2294d81e88dc6261c768b63bc4fcdb852be6d1352498b114f61383b7" dependencies = [ "cc", + "cfg-if", + "getrandom 0.2.17", "libc", - "once_cell", - "spin", "untrusted", - "web-sys", - "winapi", + "windows-sys 0.52.0", ] [[package]] -name = "rle-decode-fast" -version = "1.0.3" +name = "rustc-hash" +version = "2.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3582f63211428f83597b51b2ddb88e2a91a9d52d12831f9d08f5e624e8977422" +checksum = "357703d41365b4b27c590e3ed91eabb1b663f07c4c084095e60cbed4362dff0d" [[package]] name = "rustc_version" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bfa0f585226d2e68097d4f95d113b15b83a82e819ab25717ec0590d9584ef366" -dependencies = [ - "semver 1.0.17", -] - -[[package]] -name = "rustfmt-wrapper" -version = "0.2.0" +version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ed729e3bee08ec2befd593c27e90ca9fdd25efdc83c94c3b82eaef16e4f7406e" +checksum = "cfcb3a22ef46e85b45de6ee7e79d063319ebb6594faafcf1c225ea92ab6e9b92" dependencies = [ - 
"serde", - "tempfile", - "thiserror", - "toml", - "toolchain_find", + "semver", ] [[package]] name = "rustix" -version = "0.36.9" +version = "1.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fd5c6ff11fecd55b40746d1995a02f2eb375bf8c00d192d521ee09f42bef37bc" +checksum = "146c9e247ccc180c1f61615433868c99f3de3ae256a30a43b49f67c2d9171f34" dependencies = [ "bitflags", "errno", - "io-lifetimes", "libc", "linux-raw-sys", - "windows-sys 0.45.0", + "windows-sys 0.61.2", ] [[package]] name = "rustls" -version = "0.20.8" +version = "0.23.36" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fff78fc74d175294f4e83b28343315ffcfb114b156f0185e9741cb5570f50e2f" +checksum = "c665f33d38cea657d9614f766881e4d510e0eda4239891eea56b4cadcf01801b" dependencies = [ - "log", + "once_cell", "ring", - "sct", - "webpki", + "rustls-pki-types", + "rustls-webpki", + "subtle", + "zeroize", ] [[package]] -name = "rustls-pemfile" -version = "1.0.2" +name = "rustls-native-certs" +version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d194b56d58803a43635bdc398cd17e383d6f71f9182b9a192c127ca42494a59b" +checksum = "612460d5f7bea540c490b2b6395d8e34a953e52b491accd6c86c8164c5932a63" dependencies = [ - "base64", + "openssl-probe", + "rustls-pki-types", + "schannel", + "security-framework", +] + +[[package]] +name = "rustls-pki-types" +version = "1.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be040f8b0a225e40375822a563fa9524378b9d63112f53e19ffff34df5d33fdd" +dependencies = [ + "web-time", + "zeroize", +] + +[[package]] +name = "rustls-webpki" +version = "0.103.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d7df23109aa6c1567d1c575b9952556388da57401e4ace1d15f79eedad0d8f53" +dependencies = [ + "ring", + "rustls-pki-types", + "untrusted", ] [[package]] name = "rustversion" -version = "1.0.12" +version = "1.0.22" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "4f3208ce4d8448b3f3e7d168a73f5e0c43a61e32930de3bceeccedb388b6bf06" +checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" [[package]] name = "ryu" -version = "1.0.13" +version = "1.0.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f91339c0467de62360649f8d3e185ca8de4224ff281f66000de5eb2a77a79041" +checksum = "9774ba4a74de5f7b1c1451ed6cd5285a32eddb5cccb8cc655a4e50009e06477f" [[package]] name = "same-file" @@ -2259,11 +3342,20 @@ dependencies = [ "winapi-util", ] +[[package]] +name = "schannel" +version = "0.1.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "891d81b926048e76efe18581bf793546b4c0eaf8448d72be8de2bbee5fd166e1" +dependencies = [ + "windows-sys 0.61.2", +] + [[package]] name = "schemars" -version = "0.8.12" +version = "0.8.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "02c613288622e5f0c3fdc5dbd4db1c5fbe752746b1d1a56a0630b78fd00de44f" +checksum = "3fbf2ae1b8bc8e02df939598064d22402220cd5bbcca1c76f7d6a310974d5615" dependencies = [ "dyn-clone", "schemars_derive", @@ -2273,119 +3365,135 @@ dependencies = [ [[package]] name = "schemars_derive" -version = "0.8.12" +version = "0.8.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "109da1e6b197438deb6db99952990c7f959572794b80ff93707d55a232545e7c" +checksum = "32e265784ad618884abaea0600a9adf15393368d840e0222d101a072f3f7534d" dependencies = [ "proc-macro2", "quote", "serde_derive_internals", - "syn", + "syn 2.0.117", ] [[package]] name = "scopeguard" -version = "1.1.0" +version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" +checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" [[package]] -name = "scratch" -version = "1.0.5" +name = "security-framework" +version = 
"3.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1792db035ce95be60c3f8853017b3999209281c24e2ba5bc8e59bf97a0c590c1" +checksum = "d17b898a6d6948c3a8ee4372c17cb384f90d2e6e912ef00895b14fd7ab54ec38" +dependencies = [ + "bitflags", + "core-foundation", + "core-foundation-sys", + "libc", + "security-framework-sys", +] [[package]] -name = "sct" -version = "0.7.0" +name = "security-framework-sys" +version = "2.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d53dcdb7c9f8158937a7981b48accfd39a43af418591a5d008c7b22b5e1b7ca4" +checksum = "321c8673b092a9a42605034a9879d73cb79101ed5fd117bc9a597b89b4e9e61a" dependencies = [ - "ring", - "untrusted", + "core-foundation-sys", + "libc", ] [[package]] name = "semver" -version = "0.11.0" +version = "1.0.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f301af10236f6df4160f7c3f04eec6dbc70ace82d23326abad5edee88801c6b6" +checksum = "d767eb0aabc880b29956c35734170f26ed551a859dbd361d140cdbeca61ab1e2" dependencies = [ - "semver-parser", + "serde", + "serde_core", ] [[package]] -name = "semver" -version = "1.0.17" +name = "seq-macro" +version = "0.3.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bebd363326d05ec3e2f532ab7660680f3b02130d780c299bca73469d521bc0ed" +checksum = "1bc711410fbe7399f390ca1c3b60ad0f53f80e95c5eb935e52268a0e2cd49acc" [[package]] -name = "semver-parser" -version = "0.10.2" +name = "serde" +version = "1.0.228" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "00b0bef5b7f9e0df16536d3961cfb6e84331c065b4066afb39768d0e319411f7" +checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e" dependencies = [ - "pest", + "serde_core", + "serde_derive", ] [[package]] -name = "seq-macro" -version = "0.3.3" +name = "serde_bytes" +version = "0.11.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"e6b44e8fc93a14e66336d230954dda83d18b4605ccace8fe09bc7514a71ad0bc" +checksum = "a5d440709e79d88e51ac01c4b72fc6cb7314017bb7da9eeff678aa94c10e3ea8" +dependencies = [ + "serde", + "serde_core", +] [[package]] -name = "serde" -version = "1.0.156" +name = "serde_core" +version = "1.0.228" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "314b5b092c0ade17c00142951e50ced110ec27cea304b1037c6969246c2469a4" +checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.156" +version = "1.0.228" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d7e29c4601e36bcec74a223228dce795f4cd3616341a4af93520ca1a837c087d" +checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.117", ] [[package]] name = "serde_derive_internals" -version = "0.26.0" +version = "0.29.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85bf8229e7920a9f636479437026331ce11aa132b4dde37d121944a44d6e5f3c" +checksum = "18d26a20a969b9e3fdf2fc2d9f21eda6c40e2de84c9408bb5d3b05d499aae711" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.117", ] [[package]] name = "serde_json" -version = "1.0.94" +version = "1.0.149" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1c533a59c9d8a93a09c6ab31f0fd5e5f4dd1b8fc9434804029839884765d04ea" +checksum = "83fc039473c5595ace860d8c4fafa220ff474b3fc6bfdb4293327f1a37e94d86" dependencies = [ "itoa", - "ryu", + "memchr", "serde", + "serde_core", + "zmij", ] [[package]] name = "serde_tokenstream" -version = "0.1.7" +version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "797ba1d80299b264f3aac68ab5d12e5825a561749db4df7cd7c8083900c5d4e9" +checksum = "64060d864397305347a78851c51588fd283767e7e7589829e8121d65512340f1" dependencies = [ "proc-macro2", + 
"quote", "serde", - "syn", + "syn 2.0.117", ] [[package]] @@ -2402,9 +3510,9 @@ dependencies = [ [[package]] name = "serde_yaml" -version = "0.9.19" +version = "0.9.34+deprecated" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f82e6c8c047aa50a7328632d067bcae6ef38772a79e28daf32f735e0e4f3dd10" +checksum = "6a8b1a1a2ebf674015cc02edccce75287f1a0130d394307b36743c2f5d504b47" dependencies = [ "indexmap", "itoa", @@ -2415,9 +3523,9 @@ dependencies = [ [[package]] name = "sha2" -version = "0.10.6" +version = "0.10.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "82e6b795fe2e3b1e845bafcb27aa35405c4d47cdfc92af5fc8d3002f76cebdc0" +checksum = "a7507d819769d01a365ab707794a4084392c824f54a7a6a7862f8c3d0892b283" dependencies = [ "cfg-if", "cpufeatures", @@ -2425,133 +3533,153 @@ dependencies = [ ] [[package]] -name = "slab" -version = "0.4.8" +name = "shlex" +version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6528351c9bc8ab22353f9d776db39a20288e8d6c37ef8cfe3317cf875eecfc2d" -dependencies = [ - "autocfg", -] +checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" [[package]] -name = "smallvec" -version = "1.10.0" +name = "simd-adler32" +version = "0.3.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a507befe795404456341dfab10cef66ead4c041f62b8b11bbb92bffe5d0953e0" +checksum = "e320a6c5ad31d271ad523dcf3ad13e2767ad8b1cb8f047f75a8aeaf8da139da2" [[package]] -name = "snafu" -version = "0.7.4" +name = "simdutf8" +version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cb0656e7e3ffb70f6c39b3c2a86332bb74aa3c679da781642590f3c1118c5045" -dependencies = [ - "doc-comment", - "snafu-derive", -] +checksum = "e3a9fe34e3e7a50316060351f37187a3f546bce95496156754b601a5fa71b76e" [[package]] -name = "snafu-derive" -version = "0.7.4" +name = "siphasher" +version = "1.0.2" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "475b3bbe5245c26f2d8a6f62d67c1f30eb9fffeccee721c45d162c3ebbdf81b2" -dependencies = [ - "heck 0.4.1", - "proc-macro2", - "quote", - "syn", -] +checksum = "b2aa850e253778c88a04c3d7323b043aeda9d3e30d5971937c1855769763678e" + +[[package]] +name = "slab" +version = "0.4.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c790de23124f9ab44544d7ac05d60440adc586479ce501c1d6d7da3cd8c9cf5" + +[[package]] +name = "smallvec" +version = "1.15.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03" [[package]] name = "snap" -version = "1.1.0" +version = "1.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5e9f0ab6ef7eb7353d9119c170a436d1bf248eea575ac42d19d12f4e34130831" +checksum = "1b6b67fb9a61334225b5b790716f609cd58395f895b3fe8b328786812a40bc3b" [[package]] name = "socket2" -version = "0.4.9" +version = "0.6.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "64a4a911eed85daf18834cfaa86a79b7d266ff93ff5ba14005426219480ed662" +checksum = "86f4aa3ad99f2088c990dfa82d367e19cb29268ed67c574d10d0a4bfe71f07e0" dependencies = [ "libc", - "winapi", + "windows-sys 0.60.2", ] -[[package]] -name = "spin" -version = "0.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6e63cff320ae2c57904679ba7cb63280a3dc4613885beafb148ee7bf9aa9042d" - [[package]] name = "sqlparser" -version = "0.32.0" +version = "0.61.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0366f270dbabb5cc2e4c88427dc4c08bba144f81e32fbd459a013f26a4d16aa0" +checksum = "dbf5ea8d4d7c808e1af1cbabebca9a2abe603bcefc22294c5b95018d53200cb7" dependencies = [ "log", + "recursive", "sqlparser_derive", ] [[package]] name = "sqlparser_derive" -version = "0.1.1" +version = "0.5.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "55fe75cb4a364c7f7ae06c7dbbc8d84bddd85d6cdf9975963c3935bc1991761e" +checksum = "a6dd45d8fc1c79299bfbb7190e42ccbbdf6a5f52e4a6ad98d92357ea965bd289" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.117", ] [[package]] -name = "static_assertions" -version = "1.1.0" +name = "stable_deref_trait" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596" + +[[package]] +name = "stacker" +version = "0.1.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" +checksum = "08d74a23609d509411d10e2176dc2a4346e3b4aea2e7b1869f19fdedbc71c013" +dependencies = [ + "cc", + "cfg-if", + "libc", + "psm", + "windows-sys 0.59.0", +] + +[[package]] +name = "strsim" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" [[package]] name = "strum" -version = "0.24.1" +version = "0.27.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "063e6045c0e62079840579a7e47a355ae92f60eb74daaf156fb1e84ba164e63f" +checksum = "af23d6f6c1a224baef9d3f61e287d2761385a5b88fdab4eb4c6f11aeb54c4bcf" [[package]] name = "strum_macros" -version = "0.24.3" +version = "0.27.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e385be0d24f186b4ce2f9982191e7101bb737312ad61c1f2f984f34bcf85d59" +checksum = "7695ce3845ea4b33927c055a39dc438a45b059f7c1b3d91d38d10355fb8cbca7" dependencies = [ - "heck 0.4.1", + "heck", "proc-macro2", "quote", - "rustversion", - "syn", + "syn 2.0.117", ] [[package]] name = "substrait" -version = "0.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3e977fc98d1e03cf99220bb6bb96f8838ffa5c1306a8c83c1b25aa20817eb6d0" -dependencies 
= [ - "heck 0.4.1", - "prost 0.11.8", - "prost-build 0.11.8", - "prost-types 0.11.8", +version = "0.62.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "62fc4b483a129b9772ccb9c3f7945a472112fdd9140da87f8a4e7f1d44e045d0" +dependencies = [ + "heck", + "pbjson", + "pbjson-build", + "pbjson-types", + "prettyplease", + "prost", + "prost-build", + "prost-types", + "protobuf-src", + "regress", "schemars", + "semver", "serde", "serde_json", "serde_yaml", + "syn 2.0.117", "typify", "walkdir", ] [[package]] name = "subtle" -version = "2.4.1" +version = "2.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6bdef32e8150c2a081110b42772ffe7d7c9032b606bc226c8260fd97e0976601" +checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" [[package]] name = "syn" @@ -2565,51 +3693,73 @@ dependencies = [ ] [[package]] -name = "target-lexicon" -version = "0.12.6" +name = "syn" +version = "2.0.117" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8ae9980cab1db3fceee2f6c6f643d5d8de2997c58ee8d25fb0cc8a9e9e7348e5" +checksum = "e665b8803e7b1d2a727f4023456bbbbe74da67099c585258af0ad9c5013b9b99" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] [[package]] -name = "tempfile" -version = "3.4.0" +name = "sync_wrapper" +version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "af18f7ae1acd354b992402e9ec5864359d693cd8a79dcbef59f76891701c1e95" +checksum = "0bf256ce5efdfa370213c1dabab5935a12e49f2c58d15e9eac2870d3b4f27263" dependencies = [ - "cfg-if", - "fastrand", - "redox_syscall", - "rustix", - "windows-sys 0.42.0", + "futures-core", ] [[package]] -name = "termcolor" -version = "1.2.0" +name = "synstructure" +version = "0.13.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "be55cf8942feac5c765c2c993422806843c9a9a45d4d5c407ad6dd2ea95eb9b6" +checksum = 
"728a70f3dbaf5bab7f0c4b1ac8d7ae5ea60a4b5549c8a5914361c99147a709d2" dependencies = [ - "winapi-util", + "proc-macro2", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "target-lexicon" +version = "0.13.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "adb6935a6f5c20170eeceb1a3835a49e12e19d792f6dd344ccc76a985ca5a6ca" + +[[package]] +name = "tempfile" +version = "3.25.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0136791f7c95b1f6dd99f9cc786b91bb81c3800b639b3478e561ddb7be95e5f1" +dependencies = [ + "fastrand", + "getrandom 0.4.1", + "once_cell", + "rustix", + "windows-sys 0.61.2", ] [[package]] name = "thiserror" -version = "1.0.39" +version = "2.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a5ab016db510546d856297882807df8da66a16fb8c4101cb8b30054b0d5b2d9c" +checksum = "4288b5bcbc7920c07a1149a35cf9590a2aa808e0bc1eafaade0b80947865fbc4" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "1.0.39" +version = "2.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5420d42e90af0c38c3290abcca25b9b3bdf379fc9f55c528f53a269d9c9a267e" +checksum = "ebc4ee7f67670e9b64d05fa4253e753e016c6c95ff35b89b7941d6b856dec1d5" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.117", ] [[package]] @@ -2624,30 +3774,29 @@ dependencies = [ ] [[package]] -name = "time" -version = "0.1.45" +name = "tiny-keccak" +version = "2.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1b797afad3f312d1c66a56d11d0316f916356d11bd158fbc6ca6389ff6bf805a" +checksum = "2c9d3793400a45f954c52e73d068316d76b6f4e36977e3fcebb13a2721e80237" dependencies = [ - "libc", - "wasi 0.10.0+wasi-snapshot-preview1", - "winapi", + "crunchy", ] [[package]] -name = "tiny-keccak" -version = "2.0.2" +name = "tinystr" +version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"2c9d3793400a45f954c52e73d068316d76b6f4e36977e3fcebb13a2721e80237" +checksum = "42d3e9c45c09de15d06dd8acf5f4e0e399e85927b7f00711024eb7ae10fa4869" dependencies = [ - "crunchy", + "displaydoc", + "zerovec", ] [[package]] name = "tinyvec" -version = "1.6.0" +version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87cc5ceb3875bb20c2890005a4e226a4651264a5c75edb2421b52861a0a0cb50" +checksum = "bfa5fdc3bce6191a1dbc8c02d5c8bffcf557bafa17c124c5264a458f1b0613fa" dependencies = [ "tinyvec_macros", ] @@ -2660,105 +3809,116 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.26.0" +version = "1.49.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "03201d01c3c27a29c8a5cee5b55a93ddae1ccf6f08f65365c2c918f8c1b76f64" +checksum = "72a2903cd7736441aac9df9d7688bd0ce48edccaadf181c3b90be801e81d3d86" dependencies = [ - "autocfg", "bytes", "libc", - "memchr", "mio", - "num_cpus", - "parking_lot", "pin-project-lite", "socket2", "tokio-macros", - "windows-sys 0.45.0", + "windows-sys 0.61.2", ] [[package]] name = "tokio-macros" -version = "1.8.2" +version = "2.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d266c00fde287f55d3f1c3e96c500c362a2b8c695076ec180f27918820bc6df8" +checksum = "af407857209536a95c8e56f8231ef2c2e2aff839b22e07a1ffcbc617e9db9fa5" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.117", ] [[package]] name = "tokio-rustls" -version = "0.23.4" +version = "0.26.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c43ee83903113e03984cb9e5cebe6c04a5116269e900e3ddba8f068a62adda59" +checksum = "1729aa945f29d91ba541258c8df89027d5792d85a8841fb65e8bf0f4ede4ef61" dependencies = [ "rustls", "tokio", - "webpki", ] [[package]] name = "tokio-stream" -version = "0.1.12" +version = "0.1.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"8fb52b74f05dbf495a8fba459fdc331812b96aa086d9eb78101fa0d4569c3313" +checksum = "32da49809aab5c3bc678af03902d4ccddea2a87d028d86392a4b1560c6906c70" dependencies = [ "futures-core", "pin-project-lite", "tokio", + "tokio-util", ] [[package]] name = "tokio-util" -version = "0.7.7" +version = "0.7.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5427d89453009325de0d8f342c9490009f76e999cb7672d77e46267448f7e6b2" +checksum = "9ae9cec805b01e8fc3fd2fe289f89149a9b66dd16786abd8b19cfa7b48cb0098" dependencies = [ "bytes", "futures-core", "futures-sink", "pin-project-lite", "tokio", - "tracing", ] [[package]] -name = "toml" -version = "0.5.11" +name = "tower" +version = "0.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f4f7f0dd8d50a853a531c426359045b1998f04219d88799810762cd4ad314234" +checksum = "ebe5ef63511595f1344e2d5cfa636d973292adc0eec1f0ad45fae9f0851ab1d4" dependencies = [ - "serde", + "futures-core", + "futures-util", + "pin-project-lite", + "sync_wrapper", + "tokio", + "tower-layer", + "tower-service", ] [[package]] -name = "toolchain_find" -version = "0.2.0" +name = "tower-http" +version = "0.6.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5e85654a10e7a07a47c6f19d93818f3f343e22927f2fa280c84f7c8042743413" +checksum = "d4e6559d53cc268e5031cd8429d05415bc4cb4aefc4aa5d6cc35fbf5b924a1f8" dependencies = [ - "home", - "lazy_static", - "regex", - "semver 0.11.0", - "walkdir", + "bitflags", + "bytes", + "futures-util", + "http", + "http-body", + "iri-string", + "pin-project-lite", + "tower", + "tower-layer", + "tower-service", ] +[[package]] +name = "tower-layer" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "121c2a6cda46980bb0fcd1647ffaf6cd3fc79a013de288782836f6df9c48780e" + [[package]] name = "tower-service" -version = "0.3.2" +version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"b6bc1c9ce2b5135ac7f93c72918fc37feb872bdc6a5533a8b85eb4b86bfdae52" +checksum = "8df9b6e13f2d32c91b9bd719c00d1958837bc7dec474d94952798cc8e69eeec3" [[package]] name = "tracing" -version = "0.1.37" +version = "0.1.44" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8ce8c33a8d48bd45d624a6e523445fd21ec13d3653cd51f681abf67418f54eb8" +checksum = "63e71662fa4b2a2c3a26f570f037eb95bb1f85397f3cd8076caed2f026a6d100" dependencies = [ - "cfg-if", "pin-project-lite", "tracing-attributes", "tracing-core", @@ -2766,62 +3926,74 @@ dependencies = [ [[package]] name = "tracing-attributes" -version = "0.1.23" +version = "0.1.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4017f8f45139870ca7e672686113917c71c7a6e02d4924eda67186083c03081a" +checksum = "7490cfa5ec963746568740651ac6781f701c9c5ea257c58e057f3ba8cf69e8da" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.117", ] [[package]] name = "tracing-core" -version = "0.1.30" +version = "0.1.36" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "24eb03ba0eab1fd845050058ce5e616558e8f8d8fca633e6b163fe25c797213a" +checksum = "db97caf9d906fbde555dd62fa95ddba9eecfd14cb388e4f491a66d74cd5fb79a" dependencies = [ "once_cell", ] [[package]] name = "try-lock" -version = "0.2.4" +version = "0.2.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3528ecfd12c466c6f163363caf2d02a71161dd5e1cc6ae7b34207ea2d42d81ed" +checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" [[package]] -name = "twox-hash" -version = "1.6.3" +name = "tstr" +version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "97fee6b57c6a41524a810daee9286c02d7752c4253064d0b05472833a438f675" +checksum = "7f8e0294f14baae476d0dd0a2d780b2e24d66e349a9de876f5126777a37bdba7" dependencies = [ - "cfg-if", - "static_assertions", + "tstr_proc_macros", ] [[package]] -name = "typed-builder" -version = "0.10.0" 
+name = "tstr_proc_macros" +version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89851716b67b937e393b3daa8423e67ddfc4bbbf1654bcf05488e95e0828db0c" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] +checksum = "e78122066b0cb818b8afd08f7ed22f7fdbc3e90815035726f0840d0d26c0747a" + +[[package]] +name = "twox-hash" +version = "2.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ea3136b675547379c4bd395ca6b938e5ad3c3d20fad76e7fe85f9e0d011419c" + +[[package]] +name = "typed-arena" +version = "2.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6af6ae20167a9ece4bcb41af5b80f8a1f1df981f6391189ce00fd257af04126a" [[package]] name = "typenum" -version = "1.16.0" +version = "1.19.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "562d481066bde0658276a35467c4af00bdc6ee726305698a55b86e61d7ad82bb" + +[[package]] +name = "typewit" +version = "1.14.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "497961ef93d974e23eb6f433eb5fe1b7930b659f06d12dec6fc44a8f554c0bba" +checksum = "f8c1ae7cc0fdb8b842d65d127cb981574b0d2b249b74d1c7a2986863dc134f71" [[package]] name = "typify" -version = "0.0.10" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2e8486352f3c946e69f983558cfc09b295250b01e01b381ec67a05a812d01d63" +checksum = "e6d5bcc6f62eb1fa8aa4098f39b29f93dcb914e17158b76c50360911257aa629" dependencies = [ "typify-impl", "typify-macro", @@ -2829,198 +4001,188 @@ dependencies = [ [[package]] name = "typify-impl" -version = "0.0.10" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a7624d0b911df6e2bbf34a236f76281f93b294cdde1d4df1dbdb748e5a7fefa5" +checksum = "a1eb359f7ffa4f9ebe947fa11a1b2da054564502968db5f317b7e37693cb2240" dependencies = [ - "heck 0.4.1", + "heck", "log", "proc-macro2", "quote", "regress", - "rustfmt-wrapper", 
"schemars", + "semver", + "serde", "serde_json", - "syn", + "syn 2.0.117", "thiserror", "unicode-ident", ] [[package]] name = "typify-macro" -version = "0.0.10" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c42802aa033cee7650a4e1509ba7d5848a56f84be7c4b31e4385ee12445e942" +checksum = "911c32f3c8514b048c1b228361bebb5e6d73aeec01696e8cc0e82e2ffef8ab7a" dependencies = [ "proc-macro2", "quote", "schemars", + "semver", "serde", "serde_json", "serde_tokenstream", - "syn", + "syn 2.0.117", "typify-impl", ] -[[package]] -name = "ucd-trie" -version = "0.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e79c4d996edb816c91e4308506774452e55e95c3c9de07b6729e17e15a5ef81" - -[[package]] -name = "unicode-bidi" -version = "0.3.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "524b68aca1d05e03fdf03fcdce2c6c94b6daf6d16861ddaa7e4f2b6638a9052c" - [[package]] name = "unicode-ident" -version = "1.0.8" +version = "1.0.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e5464a87b239f13a63a501f2701565754bae92d243d4bb7eb12f6d57d2269bf4" - -[[package]] -name = "unicode-normalization" -version = "0.1.22" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c5713f0fc4b5db668a2ac63cdb7bb4469d8c9fed047b1d0292cc7b0ce2ba921" -dependencies = [ - "tinyvec", -] +checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75" [[package]] name = "unicode-segmentation" -version = "1.10.1" +version = "1.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1dd624098567895118886609431a7c3b8f516e41d30e0643f03d94592a147e36" +checksum = "f6ccf251212114b54433ec949fd6a7841275f9ada20dddd2f29e9ceea4501493" [[package]] name = "unicode-width" -version = "0.1.10" +version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"c0edd1e5b14653f783770bce4a4dabb4a5108a5370a5f5d8cfe8710c361f6c8b" +checksum = "b4ac048d71ede7ee76d585517add45da530660ef4390e49b098733c6e897f254" [[package]] -name = "unindent" -version = "0.1.11" +name = "unicode-xid" +version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e1766d682d402817b5ac4490b3c3002d91dfa0d22812f341609f97b08757359c" +checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853" [[package]] name = "unsafe-libyaml" -version = "0.2.7" +version = "0.2.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ad2024452afd3874bf539695e04af6732ba06517424dbf958fdb16a01f3bef6c" +checksum = "673aac59facbab8a9007c7f6108d11f63b603f7cabff99fabf650fea5c32b861" [[package]] name = "untrusted" -version = "0.7.1" +version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a156c684c91ea7d62626509bce3cb4e1d9ed5c4d978f7b4352658f96a4c26b4a" +checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" [[package]] name = "url" -version = "2.3.1" +version = "2.5.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0d68c799ae75762b8c3fe375feb6600ef5602c883c5d21eb51c09f22b83c4643" +checksum = "ff67a8a4397373c3ef660812acab3268222035010ab8680ec4215f38ba3d0eed" dependencies = [ "form_urlencoded", "idna", "percent-encoding", + "serde", ] +[[package]] +name = "utf8_iter" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be" + [[package]] name = "uuid" -version = "1.3.0" +version = "1.21.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1674845326ee10d37ca60470760d4288a6f80f304007d92e5c53bab78c9cfd79" +checksum = "b672338555252d43fd2240c714dc444b8c6fb0a5c5335e65a07bba7742735ddb" dependencies = [ - "getrandom", - "serde", + "getrandom 0.4.1", + "js-sys", + "serde_core", + 
"wasm-bindgen", ] [[package]] name = "version_check" -version = "0.9.4" +version = "0.9.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" +checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" [[package]] name = "walkdir" -version = "2.3.2" +version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "808cf2735cd4b6866113f648b791c6adc5714537bc222d9347bb203386ffda56" +checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b" dependencies = [ "same-file", - "winapi", "winapi-util", ] [[package]] name = "want" -version = "0.3.0" +version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1ce8a968cb1cd110d136ff8b819a556d6fb6d919363c61534f6860c7eb172ba0" +checksum = "bfa7760aed19e106de2c7c0b581b509f2f25d3dacaf737cb82ac61bc6d760b0e" dependencies = [ - "log", "try-lock", ] [[package]] name = "wasi" -version = "0.10.0+wasi-snapshot-preview1" +version = "0.11.1+wasi-snapshot-preview1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a143597ca7c7793eff794def352d41792a93c481eb1042423ff7ff72ba2c31f" +checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b" [[package]] -name = "wasi" -version = "0.11.0+wasi-snapshot-preview1" +name = "wasip2" +version = "1.0.2+wasi-0.2.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" +checksum = "9517f9239f02c069db75e65f174b3da828fe5f5b945c4dd26bd25d89c03ebcf5" +dependencies = [ + "wit-bindgen", +] [[package]] -name = "wasm-bindgen" -version = "0.2.84" +name = "wasip3" +version = "0.4.0+wasi-0.3.0-rc-2026-01-06" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "31f8dcbc21f30d9b8f2ea926ecb58f6b91192c17e9d33594b3df58b2007ca53b" +checksum = 
"5428f8bf88ea5ddc08faddef2ac4a67e390b88186c703ce6dbd955e1c145aca5" dependencies = [ - "cfg-if", - "wasm-bindgen-macro", + "wit-bindgen", ] [[package]] -name = "wasm-bindgen-backend" -version = "0.2.84" +name = "wasm-bindgen" +version = "0.2.108" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95ce90fd5bcc06af55a641a86428ee4229e44e07033963a2290a8e241607ccb9" +checksum = "64024a30ec1e37399cf85a7ffefebdb72205ca1c972291c51512360d90bd8566" dependencies = [ - "bumpalo", - "log", + "cfg-if", "once_cell", - "proc-macro2", - "quote", - "syn", + "rustversion", + "wasm-bindgen-macro", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-futures" -version = "0.4.34" +version = "0.4.58" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f219e0d211ba40266969f6dbdd90636da12f75bee4fc9d6c23d1260dadb51454" +checksum = "70a6e77fd0ae8029c9ea0063f87c46fde723e7d887703d74ad2616d792e51e6f" dependencies = [ "cfg-if", + "futures-util", "js-sys", + "once_cell", "wasm-bindgen", "web-sys", ] [[package]] name = "wasm-bindgen-macro" -version = "0.2.84" +version = "0.2.108" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c21f77c0bedc37fd5dc21f897894a5ca01e7bb159884559461862ae90c0b4c5" +checksum = "008b239d9c740232e71bd39e8ef6429d27097518b6b30bdf9086833bd5b6d608" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -3028,74 +4190,91 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.84" +version = "0.2.108" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2aff81306fcac3c7515ad4e177f521b5c9a15f2b08f4e32d823066102f35a5f6" +checksum = "5256bae2d58f54820e6490f9839c49780dff84c65aeab9e772f15d5f0e913a55" dependencies = [ + "bumpalo", "proc-macro2", "quote", - "syn", - "wasm-bindgen-backend", + "syn 2.0.117", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-shared" -version = "0.2.84" +version = "0.2.108" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "1f01b580c9ac74c8d8f0c0e4afb04eeef2acf145458e52c03845ee9cd23e3d12" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "wasm-encoder" +version = "0.244.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0046fef7e28c3804e5e38bfa31ea2a0f73905319b677e57ebe37e49358989b5d" +checksum = "990065f2fe63003fe337b932cfb5e3b80e0b4d0f5ff650e6985b1048f62c8319" +dependencies = [ + "leb128fmt", + "wasmparser", +] [[package]] -name = "wasm-streams" -version = "0.2.3" +name = "wasm-metadata" +version = "0.244.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6bbae3363c08332cadccd13b67db371814cd214c2524020932f0804b8cf7c078" +checksum = "bb0e353e6a2fbdc176932bbaab493762eb1255a7900fe0fea1a2f96c296cc909" dependencies = [ - "futures-util", - "js-sys", - "wasm-bindgen", - "wasm-bindgen-futures", - "web-sys", + "anyhow", + "indexmap", + "wasm-encoder", + "wasmparser", ] [[package]] -name = "web-sys" -version = "0.3.61" +name = "wasm-streams" +version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e33b99f4b23ba3eec1a53ac264e35a755f00e966e0065077d6027c0f575b0b97" +checksum = "15053d8d85c7eccdbefef60f06769760a563c7f0a9d6902a13d35c7800b0ad65" dependencies = [ + "futures-util", "js-sys", "wasm-bindgen", + "wasm-bindgen-futures", + "web-sys", ] [[package]] -name = "webpki" -version = "0.22.0" +name = "wasmparser" +version = "0.244.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f095d78192e208183081cc07bc5515ef55216397af48b873e5edcd72637fa1bd" +checksum = "47b807c72e1bac69382b3a6fb3dbe8ea4c0ed87ff5629b8685ae6b9a611028fe" dependencies = [ - "ring", - "untrusted", + "bitflags", + "hashbrown 0.15.5", + "indexmap", + "semver", ] [[package]] -name = "webpki-roots" -version = "0.22.6" +name = "web-sys" +version = "0.3.85" source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "b6c71e40d7d2c34a5106301fb632274ca37242cd0c9d3e64dbece371a40a2d87" +checksum = "312e32e551d92129218ea9a2452120f4aabc03529ef03e4d0d82fb2780608598" dependencies = [ - "webpki", + "js-sys", + "wasm-bindgen", ] [[package]] -name = "which" -version = "4.4.0" +name = "web-time" +version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2441c784c52b289a054b7201fc93253e288f094e2f4be9058343127c4226a269" +checksum = "5a6580f308b1fad9207618087a65c04e7a10bc77e02c8e84e9b00dd4b12fa0bb" dependencies = [ - "either", - "libc", - "once_cell", + "js-sys", + "wasm-bindgen", ] [[package]] @@ -3116,11 +4295,11 @@ checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" [[package]] name = "winapi-util" -version = "0.1.5" +version = "0.1.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "70ec6ce85bb158151cae5e5c87f95a8e97d2c0c4b001223f33a334e3ce5de178" +checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22" dependencies = [ - "winapi", + "windows-sys 0.61.2", ] [[package]] @@ -3129,171 +4308,463 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" +[[package]] +name = "windows-core" +version = "0.62.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8e83a14d34d0623b51dce9581199302a221863196a1dde71a7663a4c2be9deb" +dependencies = [ + "windows-implement", + "windows-interface", + "windows-link", + "windows-result", + "windows-strings", +] + +[[package]] +name = "windows-implement" +version = "0.60.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "053e2e040ab57b9dc951b72c264860db7eb3b0200ba345b4e4c3b14f67855ddf" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "windows-interface" +version = "0.59.3" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f316c4a2570ba26bbec722032c4099d8c8bc095efccdc15688708623367e358" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "windows-link" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" + +[[package]] +name = "windows-result" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7781fa89eaf60850ac3d2da7af8e5242a5ea78d1a11c49bf2910bb5a73853eb5" +dependencies = [ + "windows-link", +] + +[[package]] +name = "windows-strings" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7837d08f69c77cf6b07689544538e017c1bfcf57e34b4c0ff58e6c2cd3b37091" +dependencies = [ + "windows-link", +] + +[[package]] +name = "windows-sys" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" +dependencies = [ + "windows-targets 0.52.6", +] + +[[package]] +name = "windows-sys" +version = "0.59.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" +dependencies = [ + "windows-targets 0.52.6", +] + [[package]] name = "windows-sys" -version = "0.42.0" +version = "0.60.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a3e1820f08b8513f676f7ab6c1f99ff312fb97b553d30ff4dd86f9f15728aa7" +checksum = "f2f500e4d28234f72040990ec9d39e3a6b950f9f22d3dba18416c35882612bcb" dependencies = [ - "windows_aarch64_gnullvm", - "windows_aarch64_msvc", - "windows_i686_gnu", - "windows_i686_msvc", - "windows_x86_64_gnu", - "windows_x86_64_gnullvm", - "windows_x86_64_msvc", + "windows-targets 0.53.5", ] [[package]] name = "windows-sys" -version = "0.45.0" +version = "0.61.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc" +dependencies = [ + "windows-link", +] + +[[package]] +name = "windows-targets" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "75283be5efb2831d37ea142365f009c02ec203cd29a3ebecbc093d52315b66d0" +checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" dependencies = [ - "windows-targets", + "windows_aarch64_gnullvm 0.52.6", + "windows_aarch64_msvc 0.52.6", + "windows_i686_gnu 0.52.6", + "windows_i686_gnullvm 0.52.6", + "windows_i686_msvc 0.52.6", + "windows_x86_64_gnu 0.52.6", + "windows_x86_64_gnullvm 0.52.6", + "windows_x86_64_msvc 0.52.6", ] [[package]] name = "windows-targets" -version = "0.42.2" +version = "0.53.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e5180c00cd44c9b1c88adb3693291f1cd93605ded80c250a75d472756b4d071" +checksum = "4945f9f551b88e0d65f3db0bc25c33b8acea4d9e41163edf90dcd0b19f9069f3" dependencies = [ - "windows_aarch64_gnullvm", - "windows_aarch64_msvc", - "windows_i686_gnu", - "windows_i686_msvc", - "windows_x86_64_gnu", - "windows_x86_64_gnullvm", - "windows_x86_64_msvc", + "windows-link", + "windows_aarch64_gnullvm 0.53.1", + "windows_aarch64_msvc 0.53.1", + "windows_i686_gnu 0.53.1", + "windows_i686_gnullvm 0.53.1", + "windows_i686_msvc 0.53.1", + "windows_x86_64_gnu 0.53.1", + "windows_x86_64_gnullvm 0.53.1", + "windows_x86_64_msvc 0.53.1", ] [[package]] name = "windows_aarch64_gnullvm" -version = "0.42.2" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a9d8416fa8b42f5c947f8482c43e7d89e73a173cead56d044f6a56104a6d1b53" + +[[package]] +name = 
"windows_aarch64_msvc" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "597a5118570b68bc08d8d59125332c54f1ba9d9adeedeef5b99b02ba2b0698f8" +checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" [[package]] name = "windows_aarch64_msvc" -version = "0.42.2" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9d782e804c2f632e395708e99a94275910eb9100b2114651e04744e9b125006" + +[[package]] +name = "windows_i686_gnu" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e08e8864a60f06ef0d0ff4ba04124db8b0fb3be5776a5cd47641e942e58c4d43" +checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" [[package]] name = "windows_i686_gnu" -version = "0.42.2" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "960e6da069d81e09becb0ca57a65220ddff016ff2d6af6a223cf372a506593a3" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c61d927d8da41da96a81f029489353e68739737d3beca43145c8afec9a31a84f" +checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa7359d10048f68ab8b09fa71c3daccfb0e9b559aed648a8f95469c27057180c" + +[[package]] +name = "windows_i686_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" [[package]] name = "windows_i686_msvc" -version = "0.42.2" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e7ac75179f18232fe9c285163565a57ef8d3c89254a30685b57d83a38d326c2" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.6" 
source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "44d840b6ec649f480a41c8d80f9c65108b92d89345dd94027bfe06ac444d1060" +checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" [[package]] name = "windows_x86_64_gnu" -version = "0.42.2" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c3842cdd74a865a8066ab39c8a7a473c0778a3f29370b5fd6b4b9aa7df4a499" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8de912b8b8feb55c064867cf047dda097f92d51efad5b491dfb98f6bbb70cb36" +checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" [[package]] name = "windows_x86_64_gnullvm" -version = "0.42.2" +version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "26d41b46a36d453748aedef1486d5c7a85db22e56aff34643984ea85514e94a3" +checksum = "0ffa179e2d07eee8ad8f57493436566c7cc30ac536a3379fdf008f47f6bb7ae1" [[package]] name = "windows_x86_64_msvc" -version = "0.42.2" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9aec5da331524158c6d1a4ac0ab1541149c0b9505fde06423b02f5ef0106b9f0" +checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" [[package]] -name = "winreg" -version = "0.10.1" +name = "windows_x86_64_msvc" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d6bbff5f0aada427a1e5a6da5f1f98158182f26556f345ac9e04d36d0ebed650" + +[[package]] +name = "wit-bindgen" +version = "0.51.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "80d0f4e272c85def139476380b12f9ac60926689dd2e01d4923222f40580869d" +checksum = "d7249219f66ced02969388cf2bb044a09756a083d0fab1e566056b04d9fbcaa5" dependencies = [ - "winapi", + "wit-bindgen-rust-macro", ] [[package]] -name = "xz2" -version = "0.1.7" +name = 
"wit-bindgen-core" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ea61de684c3ea68cb082b7a88508a8b27fcc8b797d738bfc99a82facf1d752dc" +dependencies = [ + "anyhow", + "heck", + "wit-parser", +] + +[[package]] +name = "wit-bindgen-rust" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b7c566e0f4b284dd6561c786d9cb0142da491f46a9fbed79ea69cdad5db17f21" +dependencies = [ + "anyhow", + "heck", + "indexmap", + "prettyplease", + "syn 2.0.117", + "wasm-metadata", + "wit-bindgen-core", + "wit-component", +] + +[[package]] +name = "wit-bindgen-rust-macro" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c0f9bfd77e6a48eccf51359e3ae77140a7f50b1e2ebfe62422d8afdaffab17a" +dependencies = [ + "anyhow", + "prettyplease", + "proc-macro2", + "quote", + "syn 2.0.117", + "wit-bindgen-core", + "wit-bindgen-rust", +] + +[[package]] +name = "wit-component" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d66ea20e9553b30172b5e831994e35fbde2d165325bec84fc43dbf6f4eb9cb2" +dependencies = [ + "anyhow", + "bitflags", + "indexmap", + "log", + "serde", + "serde_derive", + "serde_json", + "wasm-encoder", + "wasm-metadata", + "wasmparser", + "wit-parser", +] + +[[package]] +name = "wit-parser" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ecc8ac4bc1dc3381b7f59c34f00b67e18f910c2c0f50015669dde7def656a736" +dependencies = [ + "anyhow", + "id-arena", + "indexmap", + "log", + "semver", + "serde", + "serde_derive", + "serde_json", + "unicode-xid", + "wasmparser", +] + +[[package]] +name = "writeable" +version = "0.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9edde0db4769d2dc68579893f2306b26c6ecfbe0ef499b013d731b7b9247e0b9" + +[[package]] +name = "yoke" +version = "0.8.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "72d6e5c6afb84d73944e5cedb052c4680d5657337201555f9f2a16b7406d4954" +dependencies = [ + "stable_deref_trait", + "yoke-derive", + "zerofrom", +] + +[[package]] +name = "yoke-derive" +version = "0.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "388c44dc09d76f1536602ead6d325eb532f5c122f17782bd57fb47baeeb767e2" +checksum = "b659052874eb698efe5b9e8cf382204678a0086ebf46982b79d6ca3182927e5d" dependencies = [ - "lzma-sys", + "proc-macro2", + "quote", + "syn 2.0.117", + "synstructure", ] [[package]] name = "zerocopy" -version = "0.6.1" +version = "0.8.39" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "332f188cc1bcf1fe1064b8c58d150f497e697f49774aa846f2dc949d9a25f236" +checksum = "db6d35d663eadb6c932438e763b262fe1a70987f9ae936e60158176d710cae4a" dependencies = [ - "byteorder", "zerocopy-derive", ] [[package]] name = "zerocopy-derive" -version = "0.3.2" +version = "0.8.39" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6505e6815af7de1746a08f69c69606bb45695a17149517680f3b2149713b19a3" +checksum = "4122cd3169e94605190e77839c9a40d40ed048d305bfdc146e7df40ab0f3e517" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.117", ] [[package]] -name = "zstd" -version = "0.11.2+zstd.1.5.2" +name = "zerofrom" +version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "20cc960326ece64f010d2d2107537f26dc589a6573a316bd5b1dba685fa5fde4" +checksum = "50cc42e0333e05660c3587f3bf9d0478688e15d870fab3346451ce7f8c9fbea5" dependencies = [ - "zstd-safe 5.0.2+zstd.1.5.2", + "zerofrom-derive", ] [[package]] -name = "zstd" -version = "0.12.3+zstd.1.5.2" +name = "zerofrom-derive" +version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "76eea132fb024e0e13fd9c2f5d5d595d8a967aa72382ac2f9d39fcc95afd0806" +checksum = 
"d71e5d6e06ab090c67b5e44993ec16b72dcbaabc526db883a360057678b48502" dependencies = [ - "zstd-safe 6.0.4+zstd.1.5.4", + "proc-macro2", + "quote", + "syn 2.0.117", + "synstructure", ] [[package]] -name = "zstd-safe" -version = "5.0.2+zstd.1.5.2" +name = "zeroize" +version = "1.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b97154e67e32c85465826e8bcc1c59429aaaf107c1e4a9e53c8d8ccd5eff88d0" + +[[package]] +name = "zerotrie" +version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d2a5585e04f9eea4b2a3d1eca508c4dee9592a89ef6f450c11719da0726f4db" +checksum = "2a59c17a5562d507e4b54960e8569ebee33bee890c70aa3fe7b97e85a9fd7851" dependencies = [ - "libc", - "zstd-sys", + "displaydoc", + "yoke", + "zerofrom", +] + +[[package]] +name = "zerovec" +version = "0.11.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c28719294829477f525be0186d13efa9a3c602f7ec202ca9e353d310fb9a002" +dependencies = [ + "yoke", + "zerofrom", + "zerovec-derive", +] + +[[package]] +name = "zerovec-derive" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eadce39539ca5cb3985590102671f2567e659fca9666581ad3411d59207951f3" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "zlib-rs" +version = "0.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c745c48e1007337ed136dc99df34128b9faa6ed542d80a1c673cf55a6d7236c8" + +[[package]] +name = "zmij" +version = "1.0.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8848ee67ecc8aedbaf3e4122217aff892639231befc6a1b58d29fff4c2cabaa" + +[[package]] +name = "zstd" +version = "0.13.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e91ee311a569c327171651566e07972200e76fcfe2242a4fa446149a3881c08a" +dependencies = [ + "zstd-safe", ] [[package]] name = "zstd-safe" -version = 
"6.0.4+zstd.1.5.4" +version = "7.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7afb4b54b8910cf5447638cb54bf4e8a65cbedd783af98b98c62ffe91f185543" +checksum = "8f49c4d5f0abb602a93fb8736af2a4f4dd9512e36f7f570d66e65ff867ed3b9d" dependencies = [ - "libc", "zstd-sys", ] [[package]] name = "zstd-sys" -version = "2.0.7+zstd.1.5.4" +version = "2.0.16+zstd.1.5.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "94509c3ba2fe55294d752b79842c530ccfab760192521df74a081a78d2b3c7f5" +checksum = "91e19ebc2adc8f83e43039e79776e3fda8ca919132d68a1fed6a5faca2683748" dependencies = [ "cc", - "libc", "pkg-config", ] diff --git a/Cargo.toml b/Cargo.toml index 03dafefe8..b584470d6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,44 +17,85 @@ [package] name = "datafusion-python" -version = "20.0.0" -homepage = "https://github.com/apache/arrow-datafusion-python" -repository = "https://github.com/apache/arrow-datafusion-python" -authors = ["Apache Arrow "] -description = "Apache Arrow DataFusion DataFrame and SQL Query Engine" +version = "52.0.0" +homepage = "https://datafusion.apache.org/python" +repository = "https://github.com/apache/datafusion-python" +authors = ["Apache DataFusion "] +description = "Apache DataFusion DataFrame and SQL Query Engine" readme = "README.md" license = "Apache-2.0" -edition = "2021" -rust-version = "1.64" +edition = "2024" +rust-version = "1.88" +include = [ + "/src", + "/datafusion", + "/LICENSE.txt", + "build.rs", + "pyproject.toml", + "Cargo.toml", + "Cargo.lock", +] [features] default = ["mimalloc"] +protoc = ["datafusion-substrait/protoc"] +substrait = ["dep:datafusion-substrait"] [dependencies] -tokio = { version = "1.24", features = ["macros", "rt", "rt-multi-thread", "sync"] } -rand = "0.8" -pyo3 = { version = "0.18.1", features = ["extension-module", "abi3", "abi3-py37"] } -datafusion = { version = "20.0.0", features = ["pyarrow", "avro"]} -datafusion-common = { version = "20.0.0", features 
= ["pyarrow"]} -datafusion-expr = { version = "20.0.0" } -datafusion-optimizer = { version = "20.0.0" } -datafusion-sql = { version = "20.0.0" } -datafusion-substrait = { version = "20.0.0" } -uuid = { version = "1.2", features = ["v4"] } -mimalloc = { version = "*", optional = true, default-features = false } -async-trait = "0.1" +tokio = { version = "1.49", features = [ + "macros", + "rt", + "rt-multi-thread", + "sync", +] } +pyo3 = { version = "0.28", features = [ + "extension-module", + "abi3", + "abi3-py310", +] } +pyo3-async-runtimes = { version = "0.28", features = ["tokio-runtime"] } +pyo3-log = "0.13.3" +arrow = { version = "58", features = ["pyarrow"] } +arrow-select = { version = "58" } +datafusion = { version = "53", features = ["avro", "unicode_expressions"] } +datafusion-substrait = { version = "53", optional = true } +datafusion-proto = { version = "53" } +datafusion-ffi = { version = "53" } +prost = "0.14.3" # keep in line with `datafusion-substrait` +serde_json = "1" +uuid = { version = "1.21", features = ["v4"] } +mimalloc = { version = "0.1", optional = true, default-features = false, features = [ + "local_dynamic_tls", +] } +async-trait = "0.1.89" futures = "0.3" -object_store = { version = "0.5.3", features = ["aws", "gcp", "azure"] } +cstr = "0.2" +object_store = { version = "0.13.1", features = [ + "aws", + "gcp", + "azure", + "http", +] } +url = "2" +log = "0.4.29" parking_lot = "0.12" -regex-syntax = "0.6.28" + +[build-dependencies] +prost-types = "0.14.3" # keep in line with `datafusion-substrait` +pyo3-build-config = "0.28" [lib] name = "datafusion_python" crate-type = ["cdylib", "rlib"] -[package.metadata.maturin] -name = "datafusion._internal" - [profile.release] lto = true codegen-units = 1 + +# We cannot publish to crates.io with any patches in the below section. Developers +# must remove any entries in this section before creating a release candidate. 
+[patch.crates-io] +datafusion = { git = "https://github.com/apache/datafusion.git", rev = "35749607f585b3bf25b66b7d2289c56c18d03e4f" } +datafusion-substrait = { git = "https://github.com/apache/datafusion.git", rev = "35749607f585b3bf25b66b7d2289c56c18d03e4f" } +datafusion-proto = { git = "https://github.com/apache/datafusion.git", rev = "35749607f585b3bf25b66b7d2289c56c18d03e4f" } +datafusion-ffi = { git = "https://github.com/apache/datafusion.git", rev = "35749607f585b3bf25b66b7d2289c56c18d03e4f" } diff --git a/README.md b/README.md index 7c29defdc..810ac8710 100644 --- a/README.md +++ b/README.md @@ -19,12 +19,19 @@ # DataFusion in Python -[![Python test](https://github.com/apache/arrow-datafusion-python/actions/workflows/test.yaml/badge.svg)](https://github.com/apache/arrow-datafusion-python/actions/workflows/test.yaml) -[![Python Release Build](https://github.com/apache/arrow-datafusion-python/actions/workflows/build.yml/badge.svg)](https://github.com/apache/arrow-datafusion-python/actions/workflows/build.yml) +[![Python test](https://github.com/apache/datafusion-python/actions/workflows/test.yaml/badge.svg)](https://github.com/apache/datafusion-python/actions/workflows/test.yaml) +[![Python Release Build](https://github.com/apache/datafusion-python/actions/workflows/build.yml/badge.svg)](https://github.com/apache/datafusion-python/actions/workflows/build.yml) -This is a Python library that binds to [Apache Arrow](https://arrow.apache.org/) in-memory query engine [DataFusion](https://github.com/apache/arrow-datafusion). +This is a Python library that binds to [Apache Arrow](https://arrow.apache.org/) in-memory query engine [DataFusion](https://github.com/apache/datafusion). -DataFusion's Python bindings can be used as an end-user tool as well as providing a foundation for building new systems. +DataFusion's Python bindings can be used as a foundation for building new data systems in Python. 
Here are some examples: + +- [Dask SQL](https://github.com/dask-contrib/dask-sql) uses DataFusion's Python bindings for SQL parsing, query + planning, and logical plan optimizations, and then transpiles the logical plan to Dask operations for execution. +- [DataFusion Ballista](https://github.com/apache/datafusion-ballista) is a distributed SQL query engine that extends + DataFusion's Python bindings for distributed use cases. +- [DataFusion Ray](https://github.com/apache/datafusion-ray) is another distributed query engine that uses + DataFusion's Python bindings. ## Features @@ -35,19 +42,9 @@ DataFusion's Python bindings can be used as an end-user tool as well as providin - Serialize and deserialize query plans in Substrait format. - Experimental support for transpiling SQL queries to DataFrame calls with Polars, Pandas, and cuDF. -## Comparison with other projects - -Here is a comparison with similar projects that may help understand when DataFusion might be suitable and unsuitable -for your needs: - -- [DuckDB](http://www.duckdb.org/) is an open source, in-process analytic database. Like DataFusion, it supports - very fast execution, both from its custom file format and directly from Parquet files. Unlike DataFusion, it is - written in C/C++ and it is primarily used directly by users as a serverless database and query system rather than - as a library for building such database systems. - -- [Polars](http://pola.rs/) is one of the fastest DataFrame libraries at the time of writing. Like DataFusion, it - is also written in Rust and uses the Apache Arrow memory model, but unlike DataFusion it does not provide full SQL - support, nor as many extension points. +For tips on tuning parallelism, see +[Maximizing CPU Usage](docs/source/user-guide/configuration.rst#maximizing-cpu-usage) +in the configuration guide. 
## Example Usage @@ -86,37 +83,115 @@ This produces the following chart: ![Chart](examples/chart.png) +## Registering a DataFrame as a View + +You can use SessionContext's `register_view` method to convert a DataFrame into a view and register it with the context. + +```python +from datafusion import SessionContext, col, literal + +# Create a DataFusion context +ctx = SessionContext() + +# Create sample data +data = {"a": [1, 2, 3, 4, 5], "b": [10, 20, 30, 40, 50]} + +# Create a DataFrame from the dictionary +df = ctx.from_pydict(data, "my_table") + +# Filter the DataFrame (for example, keep rows where a > 2) +df_filtered = df.filter(col("a") > literal(2)) + +# Register the dataframe as a view with the context +ctx.register_view("view1", df_filtered) + +# Now run a SQL query against the registered view +df_view = ctx.sql("SELECT * FROM view1") + +# Collect the results +results = df_view.collect() + +# Convert results to a list of dictionaries for display +result_dicts = [batch.to_pydict() for batch in results] + +print(result_dicts) +``` + +This will output: + +```python +[{'a': [3, 4, 5], 'b': [30, 40, 50]}] +``` + +## Configuration + +It is possible to configure runtime (memory and disk settings) and configuration settings when creating a context. + +```python +runtime = ( + RuntimeEnvBuilder() + .with_disk_manager_os() + .with_fair_spill_pool(10000000) +) +config = ( + SessionConfig() + .with_create_default_catalog_and_schema(True) + .with_default_catalog_and_schema("foo", "bar") + .with_target_partitions(8) + .with_information_schema(True) + .with_repartition_joins(False) + .with_repartition_aggregations(False) + .with_repartition_windows(False) + .with_parquet_pruning(False) + .set("datafusion.execution.parquet.pushdown_filters", "true") +) +ctx = SessionContext(config, runtime) +``` + +Refer to the [API documentation](https://arrow.apache.org/datafusion-python/#api-reference) for more information. 
+ +Printing the context will show the current configuration settings. + +```python +print(ctx) +``` + +## Extensions + +For information about how to extend DataFusion Python, please see the extensions page of the +[online documentation](https://datafusion.apache.org/python/). + ## More Examples See [examples](examples/README.md) for more information. ### Executing Queries with DataFusion -- [Query a Parquet file using SQL](./examples/sql-parquet.py) -- [Query a Parquet file using the DataFrame API](./examples/dataframe-parquet.py) -- [Run a SQL query and store the results in a Pandas DataFrame](./examples/sql-to-pandas.py) -- [Run a SQL query with a Python user-defined function (UDF)](./examples/sql-using-python-udf.py) -- [Run a SQL query with a Python user-defined aggregation function (UDAF)](./examples/sql-using-python-udaf.py) -- [Query PyArrow Data](./examples/query-pyarrow-data.py) -- [Create dataframe](./examples/import.py) -- [Export dataframe](./examples/export.py) +- [Query a Parquet file using SQL](https://github.com/apache/datafusion-python/blob/main/examples/sql-parquet.py) +- [Query a Parquet file using the DataFrame API](https://github.com/apache/datafusion-python/blob/main/examples/dataframe-parquet.py) +- [Run a SQL query and store the results in a Pandas DataFrame](https://github.com/apache/datafusion-python/blob/main/examples/sql-to-pandas.py) +- [Run a SQL query with a Python user-defined function (UDF)](https://github.com/apache/datafusion-python/blob/main/examples/sql-using-python-udf.py) +- [Run a SQL query with a Python user-defined aggregation function (UDAF)](https://github.com/apache/datafusion-python/blob/main/examples/sql-using-python-udaf.py) +- [Query PyArrow Data](https://github.com/apache/datafusion-python/blob/main/examples/query-pyarrow-data.py) +- [Create dataframe](https://github.com/apache/datafusion-python/blob/main/examples/import.py) +- [Export 
dataframe](https://github.com/apache/datafusion-python/blob/main/examples/export.py) ### Running User-Defined Python Code -- [Register a Python UDF with DataFusion](./examples/python-udf.py) -- [Register a Python UDAF with DataFusion](./examples/python-udaf.py) +- [Register a Python UDF with DataFusion](https://github.com/apache/datafusion-python/blob/main/examples/python-udf.py) +- [Register a Python UDAF with DataFusion](https://github.com/apache/datafusion-python/blob/main/examples/python-udaf.py) ### Substrait Support -- [Serialize query plans using Substrait](./examples/substrait.py) +- [Serialize query plans using Substrait](https://github.com/apache/datafusion-python/blob/main/examples/substrait.py) -### Executing SQL against DataFrame Libraries (Experimental) +## How to install -- [Executing SQL on Polars](./examples/sql-on-polars.py) -- [Executing SQL on Pandas](./examples/sql-on-pandas.py) -- [Executing SQL on cuDF](./examples/sql-on-cudf.py) +### uv -## How to install (from pip) +```bash +uv add datafusion +``` ### Pip @@ -142,73 +217,96 @@ You can verify the installation by running: ## How to develop -This assumes that you have rust and cargo installed. We use the workflow recommended by [pyo3](https://github.com/PyO3/pyo3) and [maturin](https://github.com/PyO3/maturin). +This assumes that you have rust and cargo installed. We use the workflow recommended by [pyo3](https://github.com/PyO3/pyo3) and [maturin](https://github.com/PyO3/maturin). The Maturin tools used in this workflow can be installed either via `uv` or `pip`. Both approaches should offer the same experience. It is recommended to use `uv` since it has significant performance improvements +over `pip`. + +Currently for protobuf support either [protobuf](https://protobuf.dev/installation/) or cmake must be installed. -The Maturin tools used in this workflow can be installed either via Conda or Pip. Both approaches should offer the same experience. 
Multiple approaches are only offered to appease developer preference. Bootstrapping for both Conda and Pip are as follows. +Bootstrap (`uv`): -Bootstrap (Conda): +By default `uv` will attempt to build the datafusion python package. For our development we prefer to build manually. This means +that when creating your virtual environment using `uv sync` you need to pass in the additional `--no-install-package datafusion` +and for `uv run` commands the additional parameter `--no-project` ```bash # fetch this repo -git clone git@github.com:apache/arrow-datafusion-python.git -# create the conda environment for dev -conda env create -f ./conda/environments/datafusion-dev.yaml -n datafusion-dev -# activate the conda environment -conda activate datafusion-dev +git clone git@github.com:apache/datafusion-python.git +# cd to the repo root +cd datafusion-python/ +# create the virtual environment +uv sync --dev --no-install-package datafusion +# activate the environment +source .venv/bin/activate ``` -Bootstrap (Pip): +Bootstrap (`pip`): ```bash # fetch this repo -git clone git@github.com:apache/arrow-datafusion-python.git +git clone git@github.com:apache/datafusion-python.git +# cd to the repo root +cd datafusion-python/ # prepare development environment (used to build wheel / install in development) -python3 -m venv venv +python3 -m venv .venv # activate the venv -source venv/bin/activate +source .venv/bin/activate # update pip itself if necessary python -m pip install -U pip -# install dependencies (for Python 3.8+) -python -m pip install -r requirements-310.txt +# install dependencies +python -m pip install -r pyproject.toml ``` The tests rely on test data in git submodules. 
```bash -git submodule init -git submodule update +git submodule update --init ``` Whenever rust code changes (your changes or via `git pull`): ```bash # make sure you activate the venv using "source venv/bin/activate" first -maturin develop +maturin develop --uv python -m pytest ``` +Alternatively if you are using `uv` you can do the following without +needing to activate the virtual environment: + +```bash +uv run --no-project maturin develop --uv +uv run --no-project pytest . +``` + ### Running & Installing pre-commit hooks -arrow-datafusion-python takes advantage of (pre-commit)[https://pre-commit.com/] to assist developers in with code linting to help reduce the number of commits that ultimately fail in CI due to linter errors. Using the pre-commit hooks is optional for the developer but certainly helpful for keep PRs clean and concise. +`datafusion-python` takes advantage of [pre-commit](https://pre-commit.com/) to assist developers with code linting to help reduce +the number of commits that ultimately fail in CI due to linter errors. Using the pre-commit hooks is optional for the +developer but certainly helpful for keeping PRs clean and concise. -Our pre-commit hooks can be installed by running `pre-commit install` which will install the configurations in your ARROW_DATAFUSION_PYTHON_ROOT/.github directory and run each time you perform a commit failing to perform the commit if an offending lint is found giving you the opportunity to make changes locally before pushing. +Our pre-commit hooks can be installed by running `pre-commit install`, which will install the configurations in +your DATAFUSION_PYTHON_ROOT/.github directory and run each time you perform a commit, failing to complete +the commit if an offending lint is found allowing you to make changes locally before pushing. 
-The pre-commit hooks can also be ran ad-hoc without installing them by simply running `pre-commit run --all-files` +The pre-commit hooks can also be run adhoc without installing them by simply running `pre-commit run --all-files`. -## How to update dependencies +NOTE: the current `pre-commit` hooks require docker, and cmake. See note on protobuf above. -To change test dependencies, change the `requirements.in` and run +## Running linters without using pre-commit -```bash -# install pip-tools (this can be done only once), also consider running in venv -python -m pip install pip-tools -python -m piptools compile --generate-hashes -o requirements-310.txt +There are scripts in `ci/scripts` for running Rust and Python linters. + +```shell +./ci/scripts/python_lint.sh +./ci/scripts/rust_clippy.sh +./ci/scripts/rust_fmt.sh +./ci/scripts/rust_toml_fmt.sh ``` -To update dependencies, run with `-U` +## How to update dependencies + +To change test dependencies, change the `pyproject.toml` and run ```bash -python -m piptools compile -U --generate-hashes -o requirements-310.txt +uv sync --dev --no-install-package datafusion ``` - -More details [here](https://github.com/jazzband/pip-tools) diff --git a/benchmarks/db-benchmark/README.md b/benchmarks/db-benchmark/README.md new file mode 100644 index 000000000..8ce45344d --- /dev/null +++ b/benchmarks/db-benchmark/README.md @@ -0,0 +1,32 @@ + + +# DataFusion Implementation of db-benchmark + +This directory contains scripts for running [db-benchmark](https://github.com/duckdblabs/db-benchmark) with +DataFusion's Python bindings. + +## Directions + +Run the following from root of this project. + +```bash +docker build -t db-benchmark -f benchmarks/db-benchmark/db-benchmark.dockerfile . 
+docker run --privileged -it db-benchmark +``` diff --git a/benchmarks/db-benchmark/db-benchmark.dockerfile b/benchmarks/db-benchmark/db-benchmark.dockerfile new file mode 100644 index 000000000..af2edd0f4 --- /dev/null +++ b/benchmarks/db-benchmark/db-benchmark.dockerfile @@ -0,0 +1,120 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +FROM ubuntu:22.04 +ARG DEBIAN_FRONTEND=noninteractive +ARG TARGETPLATFORM + +# This section is based on https://github.com/duckdblabs/db-benchmark/blob/master/_utils/repro.sh + +RUN apt-get -qq update +RUN apt-get -qq -y upgrade +RUN apt-get -qq install -y apt-utils + +RUN apt-get -qq install -y lsb-release software-properties-common wget curl vim htop git byobu libcurl4-openssl-dev libssl-dev +RUN apt-get -qq install -y libfreetype6-dev +RUN apt-get -qq install -y libfribidi-dev +RUN apt-get -qq install -y libharfbuzz-dev +RUN apt-get -qq install -y git +RUN apt-get -qq install -y libxml2-dev +RUN apt-get -qq install -y make +RUN apt-get -qq install -y libfontconfig1-dev +RUN apt-get -qq install -y libicu-dev pandoc zlib1g-dev libgit2-dev libcurl4-openssl-dev libssl-dev libjpeg-dev libpng-dev libtiff-dev +# apt-key adv --keyserver keyserver.ubuntu.com --recv-keys E298A3A825C0D65DFD57CBB651716619E084DAB9 +RUN add-apt-repository "deb [arch=amd64,i386] https://cloud.r-project.org/bin/linux/ubuntu $(lsb_release -cs)-cran40/" + +RUN apt-get -qq install -y r-base-dev virtualenv + +RUN cd /usr/local/lib/R && \ + chmod o+w site-library + +RUN cd / && \ + git clone https://github.com/duckdblabs/db-benchmark.git + +WORKDIR /db-benchmark + +RUN mkdir -p .R && \ + echo 'CFLAGS=-O3 -mtune=native' >> .R/Makevars && \ + echo 'CXXFLAGS=-O3 -mtune=native' >> .R/Makevars + +RUN cd pydatatable && \ + virtualenv py-pydatatable --python=/usr/bin/python3.10 +RUN cd pandas && \ + virtualenv py-pandas --python=/usr/bin/python3.10 +RUN cd modin && \ + virtualenv py-modin --python=/usr/bin/python3.10 + +RUN Rscript -e 'install.packages(c("jsonlite","bit64","devtools","rmarkdown"), dependencies=TRUE, repos="https://cloud.r-project.org")' + +SHELL ["/bin/bash", "-c"] + +RUN source ./pandas/py-pandas/bin/activate && \ + python3 -m pip install --upgrade psutil && \ + python3 -m pip install --upgrade pandas && \ + deactivate + +RUN source ./modin/py-modin/bin/activate && \ + python3 -m pip 
install --upgrade modin && \ + deactivate + +RUN source ./pydatatable/py-pydatatable/bin/activate && \ + python3 -m pip install --upgrade git+https://github.com/h2oai/datatable && \ + deactivate + +## install dplyr +#RUN Rscript -e 'devtools::install_github(c("tidyverse/readr","tidyverse/dplyr"))' + +# install data.table +RUN Rscript -e 'install.packages("data.table", repos="https://rdatatable.gitlab.io/data.table/")' + +## generate data for groupby 0.5GB +RUN Rscript _data/groupby-datagen.R 1e7 1e2 0 0 +RUN #Rscript _data/groupby-datagen.R 1e8 1e2 0 0 +RUN #Rscript _data/groupby-datagen.R 1e9 1e2 0 0 + +RUN mkdir data && \ + mv G1_1e7_1e2_0_0.csv data/ + +# set only groupby task +RUN echo "Changing run.conf and _control/data.csv to run only groupby at 0.5GB" && \ + cp run.conf run.conf.original && \ + sed -i 's/groupby join groupby2014/groupby/g' run.conf && \ + sed -i 's/data.table dplyr pandas pydatatable spark dask clickhouse polars arrow duckdb/data.table dplyr duckdb/g' run.conf && \ + sed -i 's/DO_PUBLISH=true/DO_PUBLISH=false/g' run.conf + +## set sizes +RUN mv _control/data.csv _control/data.csv.original && \ + echo "task,data,nrow,k,na,sort,active" > _control/data.csv && \ + echo "groupby,G1_1e7_1e2_0_0,1e7,1e2,0,0,1" >> _control/data.csv + +RUN #./dplyr/setup-dplyr.sh +RUN #./datatable/setup-datatable.sh +RUN #./duckdb/setup-duckdb.sh + +# END OF SETUP + +RUN python3 -m pip install --upgrade pandas +RUN python3 -m pip install --upgrade polars psutil +RUN python3 -m pip install --upgrade datafusion + +# Now add our solution +RUN rm -rf datafusion-python 2>/dev/null && \ + mkdir datafusion-python +ADD benchmarks/db-benchmark/*.py datafusion-python/ +ADD benchmarks/db-benchmark/run-bench.sh . 
+ +ENTRYPOINT [ "/db-benchmark/run-bench.sh" ] \ No newline at end of file diff --git a/benchmarks/db-benchmark/groupby-datafusion.py b/benchmarks/db-benchmark/groupby-datafusion.py new file mode 100644 index 000000000..533166695 --- /dev/null +++ b/benchmarks/db-benchmark/groupby-datafusion.py @@ -0,0 +1,527 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +import gc +import os +import timeit +from pathlib import Path + +import datafusion as df +import pyarrow as pa +from datafusion import ( + RuntimeEnvBuilder, + SessionConfig, + SessionContext, + col, +) +from datafusion import ( + functions as f, +) +from pyarrow import csv as pacsv + +print("# groupby-datafusion.py", flush=True) + +exec(Path.open("./_helpers/helpers.py").read()) + + +def ans_shape(batches) -> tuple[int, int]: + rows, cols = 0, 0 + for batch in batches: + rows += batch.num_rows + if cols == 0: + cols = batch.num_columns + else: + assert cols == batch.num_columns + return rows, cols + + +def execute(df) -> list: + print(df.execution_plan().display_indent()) + return df.collect() + + +ver = df.__version__ +git = "" +task = "groupby" +solution = "datafusion" +fun = ".groupby" +cache = "TRUE" +on_disk = "FALSE" + +# experimental - support running with both DataFrame and SQL APIs +sql = True + +data_name = os.environ["SRC_DATANAME"] +src_grp = "data" / data_name / ".csv" +print("loading dataset %s" % src_grp, flush=True) + +schema = pa.schema( + [ + ("id4", pa.int32()), + ("id5", pa.int32()), + ("id6", pa.int32()), + ("v1", pa.int32()), + ("v2", pa.int32()), + ("v3", pa.float64()), + ] +) + +data = pacsv.read_csv( + src_grp, + convert_options=pacsv.ConvertOptions(auto_dict_encode=True, column_types=schema), +) +print("dataset loaded") + +# create a session context with explicit runtime and config settings +runtime = ( + RuntimeEnvBuilder() + .with_disk_manager_os() + .with_fair_spill_pool(64 * 1024 * 1024 * 1024) +) +config = ( + SessionConfig() + .with_repartition_joins(enabled=False) + .with_repartition_aggregations(enabled=False) + .set("datafusion.execution.coalesce_batches", "false") +) +ctx = SessionContext(config, runtime) +print(ctx) + +ctx.register_record_batches("x", [data.to_batches()]) +print("registered record batches") +# cols = ctx.sql("SHOW columns from x") +# ans.show() + +in_rows = data.num_rows +# print(in_rows, flush=True) + 
+task_init = timeit.default_timer() + +question = "sum v1 by id1" # q1 +gc.collect() +t_start = timeit.default_timer() +if sql: + df = ctx.sql("SELECT id1, SUM(v1) AS v1 FROM x GROUP BY id1") +else: + df = ctx.table("x").aggregate([f.col("id1")], [f.sum(f.col("v1")).alias("v1")]) +ans = execute(df) + +shape = ans_shape(ans) +print(shape, flush=True) +t = timeit.default_timer() - t_start +print(f"q1: {t}") +m = memory_usage() +t_start = timeit.default_timer() +df = ctx.create_dataframe([ans]) +chk = df.aggregate([], [f.sum(col("v1"))]).collect()[0].column(0)[0] +chkt = timeit.default_timer() - t_start +write_log( + task=task, + data=data_name, + in_rows=in_rows, + question=question, + out_rows=shape[0], + out_cols=shape[1], + solution=solution, + version=ver, + git=git, + fun=fun, + run=1, + time_sec=t, + mem_gb=m, + cache=cache, + chk=make_chk([chk]), + chk_time_sec=chkt, + on_disk=on_disk, +) +del ans +gc.collect() + +question = "sum v1 by id1:id2" # q2 +gc.collect() +t_start = timeit.default_timer() +if sql: + df = ctx.sql("SELECT id1, id2, SUM(v1) AS v1 FROM x GROUP BY id1, id2") +else: + df = ctx.table("x").aggregate( + [f.col("id1"), f.col("id2")], [f.sum(f.col("v1")).alias("v1")] + ) +ans = execute(df) +shape = ans_shape(ans) +print(shape, flush=True) +t = timeit.default_timer() - t_start +print(f"q2: {t}") +m = memory_usage() +t_start = timeit.default_timer() +df = ctx.create_dataframe([ans]) +chk = df.aggregate([], [f.sum(col("v1"))]).collect()[0].column(0)[0] +chkt = timeit.default_timer() - t_start +write_log( + task=task, + data=data_name, + in_rows=in_rows, + question=question, + out_rows=shape[0], + out_cols=shape[1], + solution=solution, + version=ver, + git=git, + fun=fun, + run=1, + time_sec=t, + mem_gb=m, + cache=cache, + chk=make_chk([chk]), + chk_time_sec=chkt, + on_disk=on_disk, +) +del ans +gc.collect() + +question = "sum v1 mean v3 by id3" # q3 +gc.collect() +t_start = timeit.default_timer() +if sql: + df = ctx.sql("SELECT id3, SUM(v1) AS v1, 
AVG(v3) AS v3 FROM x GROUP BY id3") +else: + df = ctx.table("x").aggregate( + [f.col("id3")], + [ + f.sum(f.col("v1")).alias("v1"), + f.avg(f.col("v3")).alias("v3"), + ], + ) +ans = execute(df) +shape = ans_shape(ans) +print(shape, flush=True) +t = timeit.default_timer() - t_start +print(f"q3: {t}") +m = memory_usage() +t_start = timeit.default_timer() +df = ctx.create_dataframe([ans]) +chk = ( + df.aggregate([], [f.sum(col("v1")), f.sum(col("v3"))]) + .collect()[0] + .to_pandas() + .to_numpy()[0] +) +chkt = timeit.default_timer() - t_start +write_log( + task=task, + data=data_name, + in_rows=in_rows, + question=question, + out_rows=shape[0], + out_cols=shape[1], + solution=solution, + version=ver, + git=git, + fun=fun, + run=1, + time_sec=t, + mem_gb=m, + cache=cache, + chk=make_chk([chk]), + chk_time_sec=chkt, + on_disk=on_disk, +) +del ans +gc.collect() + +question = "mean v1:v3 by id4" # q4 +gc.collect() +t_start = timeit.default_timer() +ans = ctx.sql( + "SELECT id4, AVG(v1) AS v1, AVG(v2) AS v2, AVG(v3) AS v3 FROM x GROUP BY id4" +).collect() +shape = ans_shape(ans) +print(shape, flush=True) +t = timeit.default_timer() - t_start +print(f"q4: {t}") +m = memory_usage() +t_start = timeit.default_timer() +df = ctx.create_dataframe([ans]) +chk = ( + df.aggregate([], [f.sum(col("v1")), f.sum(col("v2")), f.sum(col("v3"))]) + .collect()[0] + .to_pandas() + .to_numpy()[0] +) +chkt = timeit.default_timer() - t_start +write_log( + task=task, + data=data_name, + in_rows=in_rows, + question=question, + out_rows=shape[0], + out_cols=shape[1], + solution=solution, + version=ver, + git=git, + fun=fun, + run=1, + time_sec=t, + mem_gb=m, + cache=cache, + chk=make_chk([chk]), + chk_time_sec=chkt, + on_disk=on_disk, +) +del ans +gc.collect() + +question = "sum v1:v3 by id6" # q5 +gc.collect() +t_start = timeit.default_timer() +ans = ctx.sql( + "SELECT id6, SUM(v1) AS v1, SUM(v2) AS v2, SUM(v3) AS v3 FROM x GROUP BY id6" +).collect() +shape = ans_shape(ans) +print(shape, 
flush=True) +t = timeit.default_timer() - t_start +print(f"q5: {t}") +m = memory_usage() +t_start = timeit.default_timer() +df = ctx.create_dataframe([ans]) +chk = ( + df.aggregate([], [f.sum(col("v1")), f.sum(col("v2")), f.sum(col("v3"))]) + .collect()[0] + .to_pandas() + .to_numpy()[0] +) +chkt = timeit.default_timer() - t_start +write_log( + task=task, + data=data_name, + in_rows=in_rows, + question=question, + out_rows=shape[0], + out_cols=shape[1], + solution=solution, + version=ver, + git=git, + fun=fun, + run=1, + time_sec=t, + mem_gb=m, + cache=cache, + chk=make_chk([chk]), + chk_time_sec=chkt, + on_disk=on_disk, +) +del ans +gc.collect() + +question = "median v3 sd v3 by id4 id5" # q6 +gc.collect() +t_start = timeit.default_timer() +ans = ctx.sql( + "SELECT id4, id5, approx_percentile_cont(v3, .5) AS median_v3, stddev(v3) AS stddev_v3 FROM x GROUP BY id4, id5" +).collect() +shape = ans_shape(ans) +print(shape, flush=True) +t = timeit.default_timer() - t_start +print(f"q6: {t}") +m = memory_usage() +t_start = timeit.default_timer() +df = ctx.create_dataframe([ans]) +chk = ( + df.aggregate([], [f.sum(col("median_v3")), f.sum(col("stddev_v3"))]) + .collect()[0] + .to_pandas() + .to_numpy()[0] +) +chkt = timeit.default_timer() - t_start +write_log( + task=task, + data=data_name, + in_rows=in_rows, + question=question, + out_rows=shape[0], + out_cols=shape[1], + solution=solution, + version=ver, + git=git, + fun=fun, + run=1, + time_sec=t, + mem_gb=m, + cache=cache, + chk=make_chk([chk]), + chk_time_sec=chkt, + on_disk=on_disk, +) +del ans +gc.collect() + +question = "max v1 - min v2 by id3" # q7 +gc.collect() +t_start = timeit.default_timer() +ans = ctx.sql( + "SELECT id3, MAX(v1) - MIN(v2) AS range_v1_v2 FROM x GROUP BY id3" +).collect() +shape = ans_shape(ans) +print(shape, flush=True) +t = timeit.default_timer() - t_start +print(f"q7: {t}") +m = memory_usage() +t_start = timeit.default_timer() +df = ctx.create_dataframe([ans]) +chk = df.aggregate([], 
[f.sum(col("range_v1_v2"))]).collect()[0].column(0)[0] +chkt = timeit.default_timer() - t_start +write_log( + task=task, + data=data_name, + in_rows=in_rows, + question=question, + out_rows=shape[0], + out_cols=shape[1], + solution=solution, + version=ver, + git=git, + fun=fun, + run=1, + time_sec=t, + mem_gb=m, + cache=cache, + chk=make_chk([chk]), + chk_time_sec=chkt, + on_disk=on_disk, +) +del ans +gc.collect() + +question = "largest two v3 by id6" # q8 +gc.collect() +t_start = timeit.default_timer() +ans = ctx.sql( + "SELECT id6, v3 from (SELECT id6, v3, row_number() OVER (PARTITION BY id6 ORDER BY v3 DESC) AS row FROM x) t WHERE row <= 2" +).collect() +shape = ans_shape(ans) +print(shape, flush=True) +t = timeit.default_timer() - t_start +print(f"q8: {t}") +m = memory_usage() +t_start = timeit.default_timer() +df = ctx.create_dataframe([ans]) +chk = df.aggregate([], [f.sum(col("v3"))]).collect()[0].column(0)[0] +chkt = timeit.default_timer() - t_start +write_log( + task=task, + data=data_name, + in_rows=in_rows, + question=question, + out_rows=shape[0], + out_cols=shape[1], + solution=solution, + version=ver, + git=git, + fun=fun, + run=1, + time_sec=t, + mem_gb=m, + cache=cache, + chk=make_chk([chk]), + chk_time_sec=chkt, + on_disk=on_disk, +) +del ans +gc.collect() + +question = "regression v1 v2 by id2 id4" # q9 +gc.collect() +t_start = timeit.default_timer() +ans = ctx.sql("SELECT corr(v1, v2) as corr FROM x GROUP BY id2, id4").collect() +shape = ans_shape(ans) +print(shape, flush=True) +t = timeit.default_timer() - t_start +print(f"q9: {t}") +m = memory_usage() +t_start = timeit.default_timer() +df = ctx.create_dataframe([ans]) +chk = df.aggregate([], [f.sum(col("corr"))]).collect()[0].column(0)[0] +chkt = timeit.default_timer() - t_start +write_log( + task=task, + data=data_name, + in_rows=in_rows, + question=question, + out_rows=shape[0], + out_cols=shape[1], + solution=solution, + version=ver, + git=git, + fun=fun, + run=1, + time_sec=t, + mem_gb=m, + 
cache=cache, + chk=make_chk([chk]), + chk_time_sec=chkt, + on_disk=on_disk, +) +del ans +gc.collect() + +question = "sum v3 count by id1:id6" # q10 +gc.collect() +t_start = timeit.default_timer() +ans = ctx.sql( + "SELECT id1, id2, id3, id4, id5, id6, SUM(v3) as v3, COUNT(*) AS cnt FROM x GROUP BY id1, id2, id3, id4, id5, id6" +).collect() +shape = ans_shape(ans) +print(shape, flush=True) +t = timeit.default_timer() - t_start +print(f"q10: {t}") +m = memory_usage() +t_start = timeit.default_timer() +df = ctx.create_dataframe([ans]) +chk = ( + df.aggregate([], [f.sum(col("v3")), f.sum(col("cnt"))]) + .collect()[0] + .to_pandas() + .to_numpy()[0] +) +chkt = timeit.default_timer() - t_start +write_log( + task=task, + data=data_name, + in_rows=in_rows, + question=question, + out_rows=shape[0], + out_cols=shape[1], + solution=solution, + version=ver, + git=git, + fun=fun, + run=1, + time_sec=t, + mem_gb=m, + cache=cache, + chk=make_chk([chk]), + chk_time_sec=chkt, + on_disk=on_disk, +) +del ans +gc.collect() + +print( + "grouping finished, took %0.fs" % (timeit.default_timer() - task_init), + flush=True, +) + +exit(0) diff --git a/benchmarks/db-benchmark/join-datafusion.py b/benchmarks/db-benchmark/join-datafusion.py new file mode 100755 index 000000000..3be296c81 --- /dev/null +++ b/benchmarks/db-benchmark/join-datafusion.py @@ -0,0 +1,299 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import gc +import os +import timeit +from pathlib import Path + +import datafusion as df +from datafusion import col +from datafusion import functions as f +from pyarrow import csv as pacsv + +print("# join-datafusion.py", flush=True) + +exec(Path.open("./_helpers/helpers.py").read()) + + +def ans_shape(batches) -> tuple[int, int]: + rows, cols = 0, 0 + for batch in batches: + rows += batch.num_rows + if cols == 0: + cols = batch.num_columns + else: + assert cols == batch.num_columns + return rows, cols + + +ver = df.__version__ +task = "join" +git = "" +solution = "datafusion" +fun = ".join" +cache = "TRUE" +on_disk = "FALSE" + +data_name = os.environ["SRC_DATANAME"] +src_jn_x = "data" / data_name / ".csv" +y_data_name = join_to_tbls(data_name) +src_jn_y = [ + "data" / y_data_name[0] / ".csv", + "data" / y_data_name[1] / ".csv", + "data" / y_data_name[2] / ".csv", +] +if len(src_jn_y) != 3: + error_msg = "Something went wrong in preparing files used for join" + raise Exception(error_msg) + +print( + "loading datasets " + + data_name + + ", " + + y_data_name[0] + + ", " + + y_data_name[1] + + ", " + + y_data_name[2], + flush=True, +) + +ctx = df.SessionContext() +print(ctx) + +# TODO we should be applying projections to these table reads to create relations +# of different sizes + +x_data = pacsv.read_csv( + src_jn_x, convert_options=pacsv.ConvertOptions(auto_dict_encode=True) +) +ctx.register_record_batches("x", [x_data.to_batches()]) +small_data = pacsv.read_csv( + src_jn_y[0], convert_options=pacsv.ConvertOptions(auto_dict_encode=True) +) 
+ctx.register_record_batches("small", [small_data.to_batches()]) +medium_data = pacsv.read_csv( + src_jn_y[1], convert_options=pacsv.ConvertOptions(auto_dict_encode=True) +) +ctx.register_record_batches("medium", [medium_data.to_batches()]) +large_data = pacsv.read_csv( + src_jn_y[2], convert_options=pacsv.ConvertOptions(auto_dict_encode=True) +) +ctx.register_record_batches("large", [large_data.to_batches()]) + +print(x_data.num_rows, flush=True) +print(small_data.num_rows, flush=True) +print(medium_data.num_rows, flush=True) +print(large_data.num_rows, flush=True) + +task_init = timeit.default_timer() +print("joining...", flush=True) + +question = "small inner on int" # q1 +gc.collect() +t_start = timeit.default_timer() +ans = ctx.sql( + "SELECT x.id1, x.id2, x.id3, x.id4 as xid4, small.id4 as smallid4, x.id5, x.id6, x.v1, small.v2 FROM x INNER JOIN small ON x.id1 = small.id1" +).collect() +# ans = ctx.sql("SELECT * FROM x INNER JOIN small ON x.id1 = small.id1").collect() +# print(set([b.schema for b in ans])) +shape = ans_shape(ans) +# print(shape, flush=True) +t = timeit.default_timer() - t_start +print(f"q1: {t}") +t_start = timeit.default_timer() +df = ctx.create_dataframe([ans]) +chk = df.aggregate([], [f.sum(col("v1"))]).collect()[0].column(0)[0] +chkt = timeit.default_timer() - t_start +m = memory_usage() +write_log( + task=task, + data=data_name, + in_rows=x_data.num_rows, + question=question, + out_rows=shape[0], + out_cols=shape[1], + solution=solution, + version=ver, + git=git, + fun=fun, + run=1, + time_sec=t, + mem_gb=m, + cache=cache, + chk=make_chk([chk]), + chk_time_sec=chkt, + on_disk=on_disk, +) +del ans +gc.collect() + +question = "medium inner on int" # q2 +gc.collect() +t_start = timeit.default_timer() +ans = ctx.sql( + "SELECT x.id1 as xid1, medium.id1 as mediumid1, x.id2, x.id3, x.id4 as xid4, medium.id4 as mediumid4, x.id5 as xid5, medium.id5 as mediumid5, x.id6, x.v1, medium.v2 FROM x INNER JOIN medium ON x.id2 = medium.id2" +).collect() 
+shape = ans_shape(ans) +# print(shape, flush=True) +t = timeit.default_timer() - t_start +print(f"q2: {t}") +t_start = timeit.default_timer() +df = ctx.create_dataframe([ans]) +chk = df.aggregate([], [f.sum(col("v1")), f.sum(col("v2"))]).collect()[0].column(0)[0] +chkt = timeit.default_timer() - t_start +m = memory_usage() +write_log( + task=task, + data=data_name, + in_rows=x_data.num_rows, + question=question, + out_rows=shape[0], + out_cols=shape[1], + solution=solution, + version=ver, + git=git, + fun=fun, + run=1, + time_sec=t, + mem_gb=m, + cache=cache, + chk=make_chk([chk]), + chk_time_sec=chkt, + on_disk=on_disk, +) +del ans +gc.collect() + +question = "medium outer on int" # q3 +gc.collect() +t_start = timeit.default_timer() +ans = ctx.sql( + "SELECT x.id1 as xid1, medium.id1 as mediumid1, x.id2, x.id3, x.id4 as xid4, medium.id4 as mediumid4, x.id5 as xid5, medium.id5 as mediumid5, x.id6, x.v1, medium.v2 FROM x LEFT JOIN medium ON x.id2 = medium.id2" +).collect() +shape = ans_shape(ans) +# print(shape, flush=True) +t = timeit.default_timer() - t_start +print(f"q3: {t}") +t_start = timeit.default_timer() +df = ctx.create_dataframe([ans]) +chk = df.aggregate([], [f.sum(col("v1")), f.sum(col("v2"))]).collect()[0].column(0)[0] +chkt = timeit.default_timer() - t_start +m = memory_usage() +write_log( + task=task, + data=data_name, + in_rows=x_data.num_rows, + question=question, + out_rows=shape[0], + out_cols=shape[1], + solution=solution, + version=ver, + git=git, + fun=fun, + run=1, + time_sec=t, + mem_gb=m, + cache=cache, + chk=make_chk([chk]), + chk_time_sec=chkt, + on_disk=on_disk, +) +del ans +gc.collect() + +question = "medium inner on factor" # q4 +gc.collect() +t_start = timeit.default_timer() +ans = ctx.sql( + "SELECT x.id1 as xid1, medium.id1 as mediumid1, x.id2, x.id3, x.id4 as xid4, medium.id4 as mediumid4, x.id5 as xid5, medium.id5 as mediumid5, x.id6, x.v1, medium.v2 FROM x LEFT JOIN medium ON x.id5 = medium.id5" +).collect() +shape = 
ans_shape(ans) +# print(shape) +t = timeit.default_timer() - t_start +print(f"q4: {t}") +t_start = timeit.default_timer() +df = ctx.create_dataframe([ans]) +chk = df.aggregate([], [f.sum(col("v1")), f.sum(col("v2"))]).collect()[0].column(0)[0] +chkt = timeit.default_timer() - t_start +m = memory_usage() +write_log( + task=task, + data=data_name, + in_rows=x_data.num_rows, + question=question, + out_rows=shape[0], + out_cols=shape[1], + solution=solution, + version=ver, + git=git, + fun=fun, + run=1, + time_sec=t, + mem_gb=m, + cache=cache, + chk=make_chk([chk]), + chk_time_sec=chkt, + on_disk=on_disk, +) +del ans +gc.collect() + +question = "big inner on int" # q5 +gc.collect() +t_start = timeit.default_timer() +ans = ctx.sql( + "SELECT x.id1 as xid1, large.id1 as largeid1, x.id2 as xid2, large.id2 as largeid2, x.id3, x.id4 as xid4, large.id4 as largeid4, x.id5 as xid5, large.id5 as largeid5, x.id6 as xid6, large.id6 as largeid6, x.v1, large.v2 FROM x LEFT JOIN large ON x.id3 = large.id3" +).collect() +shape = ans_shape(ans) +# print(shape) +t = timeit.default_timer() - t_start +print(f"q5: {t}") +t_start = timeit.default_timer() +df = ctx.create_dataframe([ans]) +chk = df.aggregate([], [f.sum(col("v1")), f.sum(col("v2"))]).collect()[0].column(0)[0] +chkt = timeit.default_timer() - t_start +m = memory_usage() +write_log( + task=task, + data=data_name, + in_rows=x_data.num_rows, + question=question, + out_rows=shape[0], + out_cols=shape[1], + solution=solution, + version=ver, + git=git, + fun=fun, + run=1, + time_sec=t, + mem_gb=m, + cache=cache, + chk=make_chk([chk]), + chk_time_sec=chkt, + on_disk=on_disk, +) +del ans +gc.collect() + +print( + "joining finished, took %0.fs" % (timeit.default_timer() - task_init), + flush=True, +) + +exit(0) diff --git a/benchmarks/db-benchmark/run-bench.sh b/benchmarks/db-benchmark/run-bench.sh new file mode 100755 index 000000000..36a6087d9 --- /dev/null +++ b/benchmarks/db-benchmark/run-bench.sh @@ -0,0 +1,27 @@ +#!/bin/bash +# 
Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +set -e + +#SRC_DATANAME=G1_1e7_1e2_0_0 python3 /db-benchmark/polars/groupby-polars.py +SRC_DATANAME=G1_1e7_1e2_0_0 python3 /db-benchmark/datafusion-python/groupby-datafusion.py + +# joins need more work still +#SRC_DATANAME=G1_1e7_1e2_0_0 python3 /db-benchmark/datafusion-python/join-datafusion.py +#SRC_DATANAME=G1_1e7_1e2_0_0 python3 /db-benchmark/polars/join-polars.py + +cat time.csv diff --git a/benchmarks/max_cpu_usage.py b/benchmarks/max_cpu_usage.py new file mode 100644 index 000000000..ae73baad6 --- /dev/null +++ b/benchmarks/max_cpu_usage.py @@ -0,0 +1,107 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Benchmark script showing how to maximize CPU usage. + +This script demonstrates one example of tuning DataFusion for improved parallelism +and CPU utilization. It uses synthetic in-memory data and performs simple aggregation +operations to showcase the impact of partitioning configuration. + +IMPORTANT: This is a simplified example designed to illustrate partitioning concepts. +Actual performance in your applications may vary significantly based on many factors: + +- Type of table providers (Parquet files, CSV, databases, etc.) +- I/O operations and storage characteristics (local disk, network, cloud storage) +- Query complexity and operation types (joins, window functions, complex expressions) +- Data distribution and size characteristics +- Memory available and hardware specifications +- Network latency for distributed data sources + +It is strongly recommended that you create similar benchmarks tailored to your specific: +- Hardware configuration +- Data sources and formats +- Typical query patterns and workloads +- Performance requirements + +This will give you more accurate insights into how DataFusion configuration options +will affect your particular use case. +""" + +from __future__ import annotations + +import argparse +import multiprocessing +import time + +import pyarrow as pa +from datafusion import SessionConfig, SessionContext, col +from datafusion import functions as f + + +def main(num_rows: int, partitions: int) -> None: + """Run a simple aggregation after repartitioning. 
+ + This function demonstrates basic partitioning concepts using synthetic data. + Real-world performance will depend on your specific data sources, query types, + and system configuration. + """ + # Create some example data (synthetic in-memory data for demonstration) + # Note: Real applications typically work with files, databases, or other + # data sources that have different I/O and distribution characteristics + array = pa.array(range(num_rows)) + batch = pa.record_batch([array], names=["a"]) + + # Configure the session to use a higher target partition count and + # enable automatic repartitioning. + config = ( + SessionConfig() + .with_target_partitions(partitions) + .with_repartition_joins(enabled=True) + .with_repartition_aggregations(enabled=True) + .with_repartition_windows(enabled=True) + ) + ctx = SessionContext(config) + + # Register the input data and repartition manually to ensure that all + # partitions are used. + df = ctx.create_dataframe([[batch]]).repartition(partitions) + + start = time.time() + df = df.aggregate([], [f.sum(col("a"))]) + df.collect() + end = time.time() + + print( + f"Processed {num_rows} rows using {partitions} partitions in {end - start:.3f}s" + ) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument( + "--rows", + type=int, + default=1_000_000, + help="Number of rows in the generated dataset", + ) + parser.add_argument( + "--partitions", + type=int, + default=multiprocessing.cpu_count(), + help="Target number of partitions to use", + ) + args = parser.parse_args() + main(args.rows, args.partitions) diff --git a/benchmarks/tpch/.gitignore b/benchmarks/tpch/.gitignore new file mode 100644 index 000000000..4471c6d15 --- /dev/null +++ b/benchmarks/tpch/.gitignore @@ -0,0 +1,2 @@ +data +results.csv \ No newline at end of file diff --git a/benchmarks/tpch/README.md b/benchmarks/tpch/README.md new file mode 100644 index 000000000..a118a7449 --- /dev/null +++ 
b/benchmarks/tpch/README.md @@ -0,0 +1,78 @@ + + +# DataFusion Python Benchmarks Derived from TPC-H + +## Create Release Build + +From repo root: + +```bash +maturin develop --release +``` + +Note that release builds take a really long time, so you may want to temporarily comment out this section of the +root Cargo.toml when frequently building. + +```toml +[profile.release] +lto = true +codegen-units = 1 +``` + +## Generate Data + +```bash +./tpch-gen.sh 1 +``` + +## Run Benchmarks + +```bash +python tpch.py ./data ./queries +``` + +A summary of the benchmark timings will be written to `results.csv`. For example: + +```csv +setup,1.4 +q1,2978.6 +q2,679.7 +q3,2943.7 +q4,2894.9 +q5,3592.3 +q6,1691.4 +q7,3003.9 +q8,3818.7 +q9,4237.9 +q10,2344.7 +q11,526.1 +q12,2284.6 +q13,1009.2 +q14,1738.4 +q15,1942.1 +q16,499.8 +q17,5178.9 +q18,4127.7 +q19,2056.6 +q20,2162.5 +q21,8046.5 +q22,754.9 +total,58513.2 +``` \ No newline at end of file diff --git a/benchmarks/tpch/create_tables.sql b/benchmarks/tpch/create_tables.sql new file mode 100644 index 000000000..9f3aeea20 --- /dev/null +++ b/benchmarks/tpch/create_tables.sql @@ -0,0 +1,143 @@ +-- Schema derived from TPC-H schema under the terms of the TPC Fair Use Policy. +-- TPC-H queries are Copyright 1993-2022 Transaction Processing Performance Council. 
+ +CREATE EXTERNAL TABLE customer ( + c_custkey INT NOT NULL, + c_name VARCHAR NOT NULL, + c_address VARCHAR NOT NULL, + c_nationkey INT NOT NULL, + c_phone VARCHAR NOT NULL, + c_acctbal DECIMAL(15, 2) NOT NULL, + c_mktsegment VARCHAR NOT NULL, + c_comment VARCHAR NOT NULL, + c_extra VARCHAR NOT NULL, +) +STORED AS CSV +OPTIONS ( + format.delimiter '|', + format.has_header true +) +LOCATION '$PATH/customer.csv'; + +CREATE EXTERNAL TABLE lineitem ( + l_orderkey INT NOT NULL, + l_partkey INT NOT NULL, + l_suppkey INT NOT NULL, + l_linenumber INT NOT NULL, + l_quantity DECIMAL(15, 2) NOT NULL, + l_extendedprice DECIMAL(15, 2) NOT NULL, + l_discount DECIMAL(15, 2) NOT NULL, + l_tax DECIMAL(15, 2) NOT NULL, + l_returnflag VARCHAR NOT NULL, + l_linestatus VARCHAR NOT NULL, + l_shipdate DATE NOT NULL, + l_commitdate DATE NOT NULL, + l_receiptdate DATE NOT NULL, + l_shipinstruct VARCHAR NOT NULL, + l_shipmode VARCHAR NOT NULL, + l_comment VARCHAR NOT NULL, + l_extra VARCHAR NOT NULL, +) +STORED AS CSV +OPTIONS ( + format.delimiter '|', + format.has_header true +) +LOCATION '$PATH/lineitem.csv'; + +CREATE EXTERNAL TABLE nation ( + n_nationkey INT NOT NULL, + n_name VARCHAR NOT NULL, + n_regionkey INT NOT NULL, + n_comment VARCHAR NOT NULL, + n_extra VARCHAR NOT NULL, +) +STORED AS CSV +OPTIONS ( + format.delimiter '|', + format.has_header true +) +LOCATION '$PATH/nation.csv'; + +CREATE EXTERNAL TABLE orders ( + o_orderkey INT NOT NULL, + o_custkey INT NOT NULL, + o_orderstatus VARCHAR NOT NULL, + o_totalprice DECIMAL(15, 2) NOT NULL, + o_orderdate DATE NOT NULL, + o_orderpriority VARCHAR NOT NULL, + o_clerk VARCHAR NOT NULL, + o_shippriority INT NULL, + o_comment VARCHAR NOT NULL, + o_extra VARCHAR NOT NULL, +) +STORED AS CSV +OPTIONS ( + format.delimiter '|', + format.has_header true +) +LOCATION '$PATH/orders.csv'; + +CREATE EXTERNAL TABLE part ( + p_partkey INT NOT NULL, + p_name VARCHAR NOT NULL, + p_mfgr VARCHAR NOT NULL, + p_brand VARCHAR NOT NULL, + p_type VARCHAR 
NOT NULL, + p_size INT NULL, + p_container VARCHAR NOT NULL, + p_retailprice DECIMAL(15, 2) NOT NULL, + p_comment VARCHAR NOT NULL, + p_extra VARCHAR NOT NULL, +) +STORED AS CSV +OPTIONS ( + format.delimiter '|', + format.has_header true +) +LOCATION '$PATH/part.csv'; + +CREATE EXTERNAL TABLE partsupp ( + ps_partkey INT NOT NULL, + ps_suppkey INT NOT NULL, + ps_availqty INT NOT NULL, + ps_supplycost DECIMAL(15, 2) NOT NULL, + ps_comment VARCHAR NOT NULL, + ps_extra VARCHAR NOT NULL, +) +STORED AS CSV +OPTIONS ( + format.delimiter '|', + format.has_header true +) +LOCATION '$PATH/partsupp.csv'; + +CREATE EXTERNAL TABLE region ( + r_regionkey INT NOT NULL, + r_name VARCHAR NOT NULL, + r_comment VARCHAR NOT NULL, + r_extra VARCHAR NOT NULL, +) +STORED AS CSV +OPTIONS ( + format.delimiter '|', + format.has_header true +) +LOCATION '$PATH/region.csv'; + +CREATE EXTERNAL TABLE supplier ( + s_suppkey INT NOT NULL, + s_name VARCHAR NOT NULL, + s_address VARCHAR NOT NULL, + s_nationkey INT NOT NULL, + s_phone VARCHAR NOT NULL, + s_acctbal DECIMAL(15, 2) NOT NULL, + s_comment VARCHAR NOT NULL, + s_extra VARCHAR NOT NULL, +) +STORED AS CSV +OPTIONS ( + format.delimiter '|', + format.has_header true +) +LOCATION '$PATH/supplier.csv'; \ No newline at end of file diff --git a/benchmarks/tpch/queries/q1.sql b/benchmarks/tpch/queries/q1.sql new file mode 100644 index 000000000..e7e8e32b8 --- /dev/null +++ b/benchmarks/tpch/queries/q1.sql @@ -0,0 +1,23 @@ +-- Benchmark Query 1 derived from TPC-H query 1 under the terms of the TPC Fair Use Policy. +-- TPC-H queries are Copyright 1993-2022 Transaction Processing Performance Council. 
+select + l_returnflag, + l_linestatus, + sum(l_quantity) as sum_qty, + sum(l_extendedprice) as sum_base_price, + sum(l_extendedprice * (1 - l_discount)) as sum_disc_price, + sum(l_extendedprice * (1 - l_discount) * (1 + l_tax)) as sum_charge, + avg(l_quantity) as avg_qty, + avg(l_extendedprice) as avg_price, + avg(l_discount) as avg_disc, + count(*) as count_order +from + lineitem +where + l_shipdate <= date '1998-12-01' - interval '68 days' +group by + l_returnflag, + l_linestatus +order by + l_returnflag, + l_linestatus; diff --git a/benchmarks/tpch/queries/q10.sql b/benchmarks/tpch/queries/q10.sql new file mode 100644 index 000000000..8391f6277 --- /dev/null +++ b/benchmarks/tpch/queries/q10.sql @@ -0,0 +1,33 @@ +-- Benchmark Query 10 derived from TPC-H query 10 under the terms of the TPC Fair Use Policy. +-- TPC-H queries are Copyright 1993-2022 Transaction Processing Performance Council. +select + c_custkey, + c_name, + sum(l_extendedprice * (1 - l_discount)) as revenue, + c_acctbal, + n_name, + c_address, + c_phone, + c_comment +from + customer, + orders, + lineitem, + nation +where + c_custkey = o_custkey + and l_orderkey = o_orderkey + and o_orderdate >= date '1993-07-01' + and o_orderdate < date '1993-07-01' + interval '3' month + and l_returnflag = 'R' + and c_nationkey = n_nationkey +group by + c_custkey, + c_name, + c_acctbal, + c_phone, + n_name, + c_address, + c_comment +order by + revenue desc limit 20; diff --git a/benchmarks/tpch/queries/q11.sql b/benchmarks/tpch/queries/q11.sql new file mode 100644 index 000000000..58776d369 --- /dev/null +++ b/benchmarks/tpch/queries/q11.sql @@ -0,0 +1,29 @@ +-- Benchmark Query 11 derived from TPC-H query 11 under the terms of the TPC Fair Use Policy. +-- TPC-H queries are Copyright 1993-2022 Transaction Processing Performance Council. 
+select + ps_partkey, + sum(ps_supplycost * ps_availqty) as value +from + partsupp, + supplier, + nation +where + ps_suppkey = s_suppkey + and s_nationkey = n_nationkey + and n_name = 'ALGERIA' +group by + ps_partkey having + sum(ps_supplycost * ps_availqty) > ( + select + sum(ps_supplycost * ps_availqty) * 0.0001000000 + from + partsupp, + supplier, + nation + where + ps_suppkey = s_suppkey + and s_nationkey = n_nationkey + and n_name = 'ALGERIA' + ) +order by + value desc; diff --git a/benchmarks/tpch/queries/q12.sql b/benchmarks/tpch/queries/q12.sql new file mode 100644 index 000000000..0b973de98 --- /dev/null +++ b/benchmarks/tpch/queries/q12.sql @@ -0,0 +1,30 @@ +-- Benchmark Query 12 derived from TPC-H query 12 under the terms of the TPC Fair Use Policy. +-- TPC-H queries are Copyright 1993-2022 Transaction Processing Performance Council. +select + l_shipmode, + sum(case + when o_orderpriority = '1-URGENT' + or o_orderpriority = '2-HIGH' + then 1 + else 0 + end) as high_line_count, + sum(case + when o_orderpriority <> '1-URGENT' + and o_orderpriority <> '2-HIGH' + then 1 + else 0 + end) as low_line_count +from + orders, + lineitem +where + o_orderkey = l_orderkey + and l_shipmode in ('FOB', 'SHIP') + and l_commitdate < l_receiptdate + and l_shipdate < l_commitdate + and l_receiptdate >= date '1995-01-01' + and l_receiptdate < date '1995-01-01' + interval '1' year +group by + l_shipmode +order by + l_shipmode; diff --git a/benchmarks/tpch/queries/q13.sql b/benchmarks/tpch/queries/q13.sql new file mode 100644 index 000000000..145dd6f10 --- /dev/null +++ b/benchmarks/tpch/queries/q13.sql @@ -0,0 +1,22 @@ +-- Benchmark Query 13 derived from TPC-H query 13 under the terms of the TPC Fair Use Policy. +-- TPC-H queries are Copyright 1993-2022 Transaction Processing Performance Council. 
+select + c_count, + count(*) as custdist +from + ( + select + c_custkey, + count(o_orderkey) + from + customer left outer join orders on + c_custkey = o_custkey + and o_comment not like '%express%requests%' + group by + c_custkey + ) as c_orders (c_custkey, c_count) +group by + c_count +order by + custdist desc, + c_count desc; diff --git a/benchmarks/tpch/queries/q14.sql b/benchmarks/tpch/queries/q14.sql new file mode 100644 index 000000000..1a91a04df --- /dev/null +++ b/benchmarks/tpch/queries/q14.sql @@ -0,0 +1,15 @@ +-- Benchmark Query 14 derived from TPC-H query 14 under the terms of the TPC Fair Use Policy. +-- TPC-H queries are Copyright 1993-2022 Transaction Processing Performance Council. +select + 100.00 * sum(case + when p_type like 'PROMO%' + then l_extendedprice * (1 - l_discount) + else 0 + end) / sum(l_extendedprice * (1 - l_discount)) as promo_revenue +from + lineitem, + part +where + l_partkey = p_partkey + and l_shipdate >= date '1995-02-01' + and l_shipdate < date '1995-02-01' + interval '1' month; diff --git a/benchmarks/tpch/queries/q15.sql b/benchmarks/tpch/queries/q15.sql new file mode 100644 index 000000000..68cc32cb7 --- /dev/null +++ b/benchmarks/tpch/queries/q15.sql @@ -0,0 +1,33 @@ +-- Benchmark Query 15 derived from TPC-H query 15 under the terms of the TPC Fair Use Policy. +-- TPC-H queries are Copyright 1993-2022 Transaction Processing Performance Council. 
+create view revenue0 (supplier_no, total_revenue) as + select + l_suppkey, + sum(l_extendedprice * (1 - l_discount)) + from + lineitem + where + l_shipdate >= date '1996-08-01' + and l_shipdate < date '1996-08-01' + interval '3' month + group by + l_suppkey; +select + s_suppkey, + s_name, + s_address, + s_phone, + total_revenue +from + supplier, + revenue0 +where + s_suppkey = supplier_no + and total_revenue = ( + select + max(total_revenue) + from + revenue0 + ) +order by + s_suppkey; +drop view revenue0; diff --git a/benchmarks/tpch/queries/q16.sql b/benchmarks/tpch/queries/q16.sql new file mode 100644 index 000000000..098b4f3b3 --- /dev/null +++ b/benchmarks/tpch/queries/q16.sql @@ -0,0 +1,32 @@ +-- Benchmark Query 16 derived from TPC-H query 16 under the terms of the TPC Fair Use Policy. +-- TPC-H queries are Copyright 1993-2022 Transaction Processing Performance Council. +select + p_brand, + p_type, + p_size, + count(distinct ps_suppkey) as supplier_cnt +from + partsupp, + part +where + p_partkey = ps_partkey + and p_brand <> 'Brand#14' + and p_type not like 'SMALL PLATED%' + and p_size in (14, 6, 5, 31, 49, 15, 41, 47) + and ps_suppkey not in ( + select + s_suppkey + from + supplier + where + s_comment like '%Customer%Complaints%' + ) +group by + p_brand, + p_type, + p_size +order by + supplier_cnt desc, + p_brand, + p_type, + p_size; diff --git a/benchmarks/tpch/queries/q17.sql b/benchmarks/tpch/queries/q17.sql new file mode 100644 index 000000000..ed02d7b77 --- /dev/null +++ b/benchmarks/tpch/queries/q17.sql @@ -0,0 +1,19 @@ +-- Benchmark Query 17 derived from TPC-H query 17 under the terms of the TPC Fair Use Policy. +-- TPC-H queries are Copyright 1993-2022 Transaction Processing Performance Council. 
+select + sum(l_extendedprice) / 7.0 as avg_yearly +from + lineitem, + part +where + p_partkey = l_partkey + and p_brand = 'Brand#42' + and p_container = 'LG BAG' + and l_quantity < ( + select + 0.2 * avg(l_quantity) + from + lineitem + where + l_partkey = p_partkey + ); diff --git a/benchmarks/tpch/queries/q18.sql b/benchmarks/tpch/queries/q18.sql new file mode 100644 index 000000000..cf1f8c89a --- /dev/null +++ b/benchmarks/tpch/queries/q18.sql @@ -0,0 +1,34 @@ +-- Benchmark Query 18 derived from TPC-H query 18 under the terms of the TPC Fair Use Policy. +-- TPC-H queries are Copyright 1993-2022 Transaction Processing Performance Council. +select + c_name, + c_custkey, + o_orderkey, + o_orderdate, + o_totalprice, + sum(l_quantity) +from + customer, + orders, + lineitem +where + o_orderkey in ( + select + l_orderkey + from + lineitem + group by + l_orderkey having + sum(l_quantity) > 313 + ) + and c_custkey = o_custkey + and o_orderkey = l_orderkey +group by + c_name, + c_custkey, + o_orderkey, + o_orderdate, + o_totalprice +order by + o_totalprice desc, + o_orderdate limit 100; diff --git a/benchmarks/tpch/queries/q19.sql b/benchmarks/tpch/queries/q19.sql new file mode 100644 index 000000000..3968f0d24 --- /dev/null +++ b/benchmarks/tpch/queries/q19.sql @@ -0,0 +1,37 @@ +-- Benchmark Query 19 derived from TPC-H query 19 under the terms of the TPC Fair Use Policy. +-- TPC-H queries are Copyright 1993-2022 Transaction Processing Performance Council. 
+select + sum(l_extendedprice* (1 - l_discount)) as revenue +from + lineitem, + part +where + ( + p_partkey = l_partkey + and p_brand = 'Brand#21' + and p_container in ('SM CASE', 'SM BOX', 'SM PACK', 'SM PKG') + and l_quantity >= 8 and l_quantity <= 8 + 10 + and p_size between 1 and 5 + and l_shipmode in ('AIR', 'AIR REG') + and l_shipinstruct = 'DELIVER IN PERSON' + ) + or + ( + p_partkey = l_partkey + and p_brand = 'Brand#13' + and p_container in ('MED BAG', 'MED BOX', 'MED PKG', 'MED PACK') + and l_quantity >= 20 and l_quantity <= 20 + 10 + and p_size between 1 and 10 + and l_shipmode in ('AIR', 'AIR REG') + and l_shipinstruct = 'DELIVER IN PERSON' + ) + or + ( + p_partkey = l_partkey + and p_brand = 'Brand#52' + and p_container in ('LG CASE', 'LG BOX', 'LG PACK', 'LG PKG') + and l_quantity >= 30 and l_quantity <= 30 + 10 + and p_size between 1 and 15 + and l_shipmode in ('AIR', 'AIR REG') + and l_shipinstruct = 'DELIVER IN PERSON' + ); diff --git a/benchmarks/tpch/queries/q2.sql b/benchmarks/tpch/queries/q2.sql new file mode 100644 index 000000000..46ec5d239 --- /dev/null +++ b/benchmarks/tpch/queries/q2.sql @@ -0,0 +1,45 @@ +-- Benchmark Query 2 derived from TPC-H query 2 under the terms of the TPC Fair Use Policy. +-- TPC-H queries are Copyright 1993-2022 Transaction Processing Performance Council. 
+select + s_acctbal, + s_name, + n_name, + p_partkey, + p_mfgr, + s_address, + s_phone, + s_comment +from + part, + supplier, + partsupp, + nation, + region +where + p_partkey = ps_partkey + and s_suppkey = ps_suppkey + and p_size = 48 + and p_type like '%TIN' + and s_nationkey = n_nationkey + and n_regionkey = r_regionkey + and r_name = 'ASIA' + and ps_supplycost = ( + select + min(ps_supplycost) + from + partsupp, + supplier, + nation, + region + where + p_partkey = ps_partkey + and s_suppkey = ps_suppkey + and s_nationkey = n_nationkey + and n_regionkey = r_regionkey + and r_name = 'ASIA' + ) +order by + s_acctbal desc, + n_name, + s_name, + p_partkey limit 100; diff --git a/benchmarks/tpch/queries/q20.sql b/benchmarks/tpch/queries/q20.sql new file mode 100644 index 000000000..5bb16563b --- /dev/null +++ b/benchmarks/tpch/queries/q20.sql @@ -0,0 +1,39 @@ +-- Benchmark Query 20 derived from TPC-H query 20 under the terms of the TPC Fair Use Policy. +-- TPC-H queries are Copyright 1993-2022 Transaction Processing Performance Council. +select + s_name, + s_address +from + supplier, + nation +where + s_suppkey in ( + select + ps_suppkey + from + partsupp + where + ps_partkey in ( + select + p_partkey + from + part + where + p_name like 'blanched%' + ) + and ps_availqty > ( + select + 0.5 * sum(l_quantity) + from + lineitem + where + l_partkey = ps_partkey + and l_suppkey = ps_suppkey + and l_shipdate >= date '1993-01-01' + and l_shipdate < date '1993-01-01' + interval '1' year + ) + ) + and s_nationkey = n_nationkey + and n_name = 'KENYA' +order by + s_name; diff --git a/benchmarks/tpch/queries/q21.sql b/benchmarks/tpch/queries/q21.sql new file mode 100644 index 000000000..6f84b876e --- /dev/null +++ b/benchmarks/tpch/queries/q21.sql @@ -0,0 +1,41 @@ +-- Benchmark Query 21 derived from TPC-H query 21 under the terms of the TPC Fair Use Policy. +-- TPC-H queries are Copyright 1993-2022 Transaction Processing Performance Council. 
+select + s_name, + count(*) as numwait +from + supplier, + lineitem l1, + orders, + nation +where + s_suppkey = l1.l_suppkey + and o_orderkey = l1.l_orderkey + and o_orderstatus = 'F' + and l1.l_receiptdate > l1.l_commitdate + and exists ( + select + * + from + lineitem l2 + where + l2.l_orderkey = l1.l_orderkey + and l2.l_suppkey <> l1.l_suppkey + ) + and not exists ( + select + * + from + lineitem l3 + where + l3.l_orderkey = l1.l_orderkey + and l3.l_suppkey <> l1.l_suppkey + and l3.l_receiptdate > l3.l_commitdate + ) + and s_nationkey = n_nationkey + and n_name = 'ARGENTINA' +group by + s_name +order by + numwait desc, + s_name limit 100; diff --git a/benchmarks/tpch/queries/q22.sql b/benchmarks/tpch/queries/q22.sql new file mode 100644 index 000000000..65ea49b04 --- /dev/null +++ b/benchmarks/tpch/queries/q22.sql @@ -0,0 +1,39 @@ +-- Benchmark Query 22 derived from TPC-H query 22 under the terms of the TPC Fair Use Policy. +-- TPC-H queries are Copyright 1993-2022 Transaction Processing Performance Council. +select + cntrycode, + count(*) as numcust, + sum(c_acctbal) as totacctbal +from + ( + select + substring(c_phone from 1 for 2) as cntrycode, + c_acctbal + from + customer + where + substring(c_phone from 1 for 2) in + ('24', '34', '16', '30', '33', '14', '13') + and c_acctbal > ( + select + avg(c_acctbal) + from + customer + where + c_acctbal > 0.00 + and substring(c_phone from 1 for 2) in + ('24', '34', '16', '30', '33', '14', '13') + ) + and not exists ( + select + * + from + orders + where + o_custkey = c_custkey + ) + ) as custsale +group by + cntrycode +order by + cntrycode; diff --git a/benchmarks/tpch/queries/q3.sql b/benchmarks/tpch/queries/q3.sql new file mode 100644 index 000000000..161f2e1e4 --- /dev/null +++ b/benchmarks/tpch/queries/q3.sql @@ -0,0 +1,24 @@ +-- Benchmark Query 3 derived from TPC-H query 3 under the terms of the TPC Fair Use Policy. +-- TPC-H queries are Copyright 1993-2022 Transaction Processing Performance Council. 
+select + l_orderkey, + sum(l_extendedprice * (1 - l_discount)) as revenue, + o_orderdate, + o_shippriority +from + customer, + orders, + lineitem +where + c_mktsegment = 'BUILDING' + and c_custkey = o_custkey + and l_orderkey = o_orderkey + and o_orderdate < date '1995-03-15' + and l_shipdate > date '1995-03-15' +group by + l_orderkey, + o_orderdate, + o_shippriority +order by + revenue desc, + o_orderdate limit 10; diff --git a/benchmarks/tpch/queries/q4.sql b/benchmarks/tpch/queries/q4.sql new file mode 100644 index 000000000..e444dbfce --- /dev/null +++ b/benchmarks/tpch/queries/q4.sql @@ -0,0 +1,23 @@ +-- Benchmark Query 4 derived from TPC-H query 4 under the terms of the TPC Fair Use Policy. +-- TPC-H queries are Copyright 1993-2022 Transaction Processing Performance Council. +select + o_orderpriority, + count(*) as order_count +from + orders +where + o_orderdate >= date '1995-04-01' + and o_orderdate < date '1995-04-01' + interval '3' month + and exists ( + select + * + from + lineitem + where + l_orderkey = o_orderkey + and l_commitdate < l_receiptdate + ) +group by + o_orderpriority +order by + o_orderpriority; diff --git a/benchmarks/tpch/queries/q5.sql b/benchmarks/tpch/queries/q5.sql new file mode 100644 index 000000000..4426bd245 --- /dev/null +++ b/benchmarks/tpch/queries/q5.sql @@ -0,0 +1,26 @@ +-- Benchmark Query 5 derived from TPC-H query 5 under the terms of the TPC Fair Use Policy. +-- TPC-H queries are Copyright 1993-2022 Transaction Processing Performance Council. 
+select + n_name, + sum(l_extendedprice * (1 - l_discount)) as revenue +from + customer, + orders, + lineitem, + supplier, + nation, + region +where + c_custkey = o_custkey + and l_orderkey = o_orderkey + and l_suppkey = s_suppkey + and c_nationkey = s_nationkey + and s_nationkey = n_nationkey + and n_regionkey = r_regionkey + and r_name = 'AFRICA' + and o_orderdate >= date '1994-01-01' + and o_orderdate < date '1994-01-01' + interval '1' year +group by + n_name +order by + revenue desc; diff --git a/benchmarks/tpch/queries/q6.sql b/benchmarks/tpch/queries/q6.sql new file mode 100644 index 000000000..3d6e51cfe --- /dev/null +++ b/benchmarks/tpch/queries/q6.sql @@ -0,0 +1,11 @@ +-- Benchmark Query 6 derived from TPC-H query 6 under the terms of the TPC Fair Use Policy. +-- TPC-H queries are Copyright 1993-2022 Transaction Processing Performance Council. +select + sum(l_extendedprice * l_discount) as revenue +from + lineitem +where + l_shipdate >= date '1994-01-01' + and l_shipdate < date '1994-01-01' + interval '1' year + and l_discount between 0.04 - 0.01 and 0.04 + 0.01 + and l_quantity < 24; diff --git a/benchmarks/tpch/queries/q7.sql b/benchmarks/tpch/queries/q7.sql new file mode 100644 index 000000000..6e36ad616 --- /dev/null +++ b/benchmarks/tpch/queries/q7.sql @@ -0,0 +1,41 @@ +-- Benchmark Query 7 derived from TPC-H query 7 under the terms of the TPC Fair Use Policy. +-- TPC-H queries are Copyright 1993-2022 Transaction Processing Performance Council. 
+select + supp_nation, + cust_nation, + l_year, + sum(volume) as revenue +from + ( + select + n1.n_name as supp_nation, + n2.n_name as cust_nation, + extract(year from l_shipdate) as l_year, + l_extendedprice * (1 - l_discount) as volume + from + supplier, + lineitem, + orders, + customer, + nation n1, + nation n2 + where + s_suppkey = l_suppkey + and o_orderkey = l_orderkey + and c_custkey = o_custkey + and s_nationkey = n1.n_nationkey + and c_nationkey = n2.n_nationkey + and ( + (n1.n_name = 'GERMANY' and n2.n_name = 'IRAQ') + or (n1.n_name = 'IRAQ' and n2.n_name = 'GERMANY') + ) + and l_shipdate between date '1995-01-01' and date '1996-12-31' + ) as shipping +group by + supp_nation, + cust_nation, + l_year +order by + supp_nation, + cust_nation, + l_year; diff --git a/benchmarks/tpch/queries/q8.sql b/benchmarks/tpch/queries/q8.sql new file mode 100644 index 000000000..e28235ed4 --- /dev/null +++ b/benchmarks/tpch/queries/q8.sql @@ -0,0 +1,39 @@ +-- Benchmark Query 8 derived from TPC-H query 8 under the terms of the TPC Fair Use Policy. +-- TPC-H queries are Copyright 1993-2022 Transaction Processing Performance Council. 
+select + o_year, + sum(case + when nation = 'IRAQ' then volume + else 0 + end) / sum(volume) as mkt_share +from + ( + select + extract(year from o_orderdate) as o_year, + l_extendedprice * (1 - l_discount) as volume, + n2.n_name as nation + from + part, + supplier, + lineitem, + orders, + customer, + nation n1, + nation n2, + region + where + p_partkey = l_partkey + and s_suppkey = l_suppkey + and l_orderkey = o_orderkey + and o_custkey = c_custkey + and c_nationkey = n1.n_nationkey + and n1.n_regionkey = r_regionkey + and r_name = 'MIDDLE EAST' + and s_nationkey = n2.n_nationkey + and o_orderdate between date '1995-01-01' and date '1996-12-31' + and p_type = 'LARGE PLATED STEEL' + ) as all_nations +group by + o_year +order by + o_year; diff --git a/benchmarks/tpch/queries/q9.sql b/benchmarks/tpch/queries/q9.sql new file mode 100644 index 000000000..86ae02482 --- /dev/null +++ b/benchmarks/tpch/queries/q9.sql @@ -0,0 +1,34 @@ +-- Benchmark Query 9 derived from TPC-H query 9 under the terms of the TPC Fair Use Policy. +-- TPC-H queries are Copyright 1993-2022 Transaction Processing Performance Council. +select + nation, + o_year, + sum(amount) as sum_profit +from + ( + select + n_name as nation, + extract(year from o_orderdate) as o_year, + l_extendedprice * (1 - l_discount) - ps_supplycost * l_quantity as amount + from + part, + supplier, + lineitem, + partsupp, + orders, + nation + where + s_suppkey = l_suppkey + and ps_suppkey = l_suppkey + and ps_partkey = l_partkey + and p_partkey = l_partkey + and o_orderkey = l_orderkey + and s_nationkey = n_nationkey + and p_name like '%moccasin%' + ) as profit +group by + nation, + o_year +order by + nation, + o_year desc; diff --git a/benchmarks/tpch/tpch-gen.sh b/benchmarks/tpch/tpch-gen.sh new file mode 100755 index 000000000..139c300a2 --- /dev/null +++ b/benchmarks/tpch/tpch-gen.sh @@ -0,0 +1,62 @@ +#!/bin/bash +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +mkdir -p data/answers 2>/dev/null + +set -e + +# If RUN_IN_CI is set, then do not produce verbose output or use an interactive terminal +if [[ -z "${RUN_IN_CI}" ]]; then + TERMINAL_FLAG="-it" + VERBOSE_OUTPUT="-vf" +else + TERMINAL_FLAG="" + VERBOSE_OUTPUT="-f" +fi + +#pushd .. +#. ./dev/build-set-env.sh +#popd + +# Generate data into the ./data directory if it does not already exist +FILE=./data/supplier.tbl +if test -f "$FILE"; then + echo "$FILE exists." +else + docker run -v `pwd`/data:/data $TERMINAL_FLAG --rm ghcr.io/scalytics/tpch-docker:main $VERBOSE_OUTPUT -s $1 + + # workaround for https://github.com/apache/arrow-datafusion/issues/6147 + mv data/customer.tbl data/customer.csv + mv data/lineitem.tbl data/lineitem.csv + mv data/nation.tbl data/nation.csv + mv data/orders.tbl data/orders.csv + mv data/part.tbl data/part.csv + mv data/partsupp.tbl data/partsupp.csv + mv data/region.tbl data/region.csv + mv data/supplier.tbl data/supplier.csv + + ls -l data +fi + +# Copy expected answers (at SF=1) into the ./data/answers directory if it does not already exist +FILE=./data/answers/q1.out +if test -f "$FILE"; then + echo "$FILE exists." 
+else + docker run -v `pwd`/data:/data $TERMINAL_FLAG --entrypoint /bin/bash --rm ghcr.io/scalytics/tpch-docker:main -c "cp /opt/tpch/2.18.0_rc2/dbgen/answers/* /data/answers/" +fi diff --git a/benchmarks/tpch/tpch.py b/benchmarks/tpch/tpch.py new file mode 100644 index 000000000..ffee5554c --- /dev/null +++ b/benchmarks/tpch/tpch.py @@ -0,0 +1,99 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +import argparse +import time +from pathlib import Path + +from datafusion import SessionContext + + +def bench(data_path, query_path) -> None: + with Path("results.csv").open("w") as results: + # register tables + start = time.time() + total_time_millis = 0 + + # create context + # runtime = ( + # RuntimeEnvBuilder() + # .with_disk_manager_os() + # .with_fair_spill_pool(10000000) + # ) + # config = ( + # SessionConfig() + # .with_create_default_catalog_and_schema(True) + # .with_default_catalog_and_schema("datafusion", "tpch") + # .with_information_schema(True) + # ) + # ctx = SessionContext(config, runtime) + + ctx = SessionContext() + print("Configuration:\n", ctx) + + # register tables + with Path("create_tables.sql").open() as f: + sql = "" + for line in f.readlines(): + if line.startswith("--"): + continue + sql = sql + line + if sql.strip().endswith(";"): + sql = sql.strip().replace("$PATH", data_path) + ctx.sql(sql) + sql = "" + + end = time.time() + time_millis = (end - start) * 1000 + total_time_millis += time_millis + print(f"setup,{round(time_millis, 1)}") + results.write(f"setup,{round(time_millis, 1)}\n") + results.flush() + + # run queries + for query in range(1, 23): + with Path(f"{query_path}/q{query}.sql").open() as f: + text = f.read() + tmp = text.split(";") + queries = [s.strip() for s in tmp if len(s.strip()) > 0] + + try: + start = time.time() + for sql in queries: + print(sql) + df = ctx.sql(sql) + # result_set = df.collect() + df.show() + end = time.time() + time_millis = (end - start) * 1000 + total_time_millis += time_millis + print(f"q{query},{round(time_millis, 1)}") + results.write(f"q{query},{round(time_millis, 1)}\n") + results.flush() + except Exception as e: + print("query", query, "failed", e) + + print(f"total,{round(total_time_millis, 1)}") + results.write(f"total,{round(total_time_millis, 1)}\n") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("data_path") + 
parser.add_argument("query_path") + args = parser.parse_args() + bench(args.data_path, args.query_path) diff --git a/build.rs b/build.rs new file mode 100644 index 000000000..4878d8b0e --- /dev/null +++ b/build.rs @@ -0,0 +1,20 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +fn main() { + pyo3_build_config::add_extension_module_link_args(); +} diff --git a/datafusion/expr.py b/ci/scripts/python_lint.sh old mode 100644 new mode 100755 similarity index 90% rename from datafusion/expr.py rename to ci/scripts/python_lint.sh index e914b85d7..3f7310ba7 --- a/datafusion/expr.py +++ b/ci/scripts/python_lint.sh @@ -1,3 +1,5 @@ +#!/usr/bin/env bash +# # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information @@ -15,9 +17,6 @@ # specific language governing permissions and limitations # under the License. 
- -from ._internal import expr - - -def __getattr__(name): - return getattr(expr, name) +set -ex +ruff format datafusion +ruff check datafusion \ No newline at end of file diff --git a/ci/scripts/rust_fmt.sh b/ci/scripts/rust_fmt.sh index 9d8325877..05cb6b208 100755 --- a/ci/scripts/rust_fmt.sh +++ b/ci/scripts/rust_fmt.sh @@ -18,4 +18,4 @@ # under the License. set -ex -cargo fmt --all -- --check +cargo +nightly fmt --all -- --check diff --git a/conda/environments/datafusion-dev.yaml b/conda/environments/datafusion-dev.yaml deleted file mode 100644 index d9405e4fe..000000000 --- a/conda/environments/datafusion-dev.yaml +++ /dev/null @@ -1,44 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -channels: -- conda-forge -dependencies: -- black -- flake8 -- isort -- maturin -- mypy -- numpy -- pyarrow -- pytest -- toml -- importlib_metadata -- python>=3.10 -# Packages useful for building distributions and releasing -- mamba -- conda-build -- anaconda-client -# Packages for documentation building -- sphinx -- pydata-sphinx-theme==0.8.0 -- myst-parser -- jinja2 -# GPU packages -- cudf -- cudatoolkit=11.8 -name: datafusion-dev diff --git a/conftest.py b/conftest.py new file mode 100644 index 000000000..1c89f92bc --- /dev/null +++ b/conftest.py @@ -0,0 +1,29 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Pytest configuration for doctest namespace injection.""" + +import datafusion as dfn +import numpy as np +import pytest + + +@pytest.fixture(autouse=True) +def _doctest_namespace(doctest_namespace: dict) -> None: + """Add common imports to the doctest namespace.""" + doctest_namespace["dfn"] = dfn + doctest_namespace["np"] = np diff --git a/datafusion/__init__.py b/datafusion/__init__.py deleted file mode 100644 index bb1beacd9..000000000 --- a/datafusion/__init__.py +++ /dev/null @@ -1,219 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. 
See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -from abc import ABCMeta, abstractmethod -from typing import List - -try: - import importlib.metadata as importlib_metadata -except ImportError: - import importlib_metadata - -import pyarrow as pa - -from ._internal import ( - AggregateUDF, - Config, - DataFrame, - SessionContext, - SessionConfig, - RuntimeConfig, - ScalarUDF, -) - -from .common import ( - DFField, - DFSchema, -) - -from .expr import ( - Alias, - Analyze, - Expr, - Filter, - Limit, - Like, - ILike, - Projection, - SimilarTo, - ScalarVariable, - Sort, - TableScan, - GetIndexedField, - Not, - IsNotNull, - IsTrue, - IsFalse, - IsUnknown, - IsNotTrue, - IsNotFalse, - IsNotUnknown, - Negative, - ScalarFunction, - BuiltinScalarFunction, - InList, - Exists, - Subquery, - InSubquery, - ScalarSubquery, - GroupingSet, - Placeholder, - Case, - Cast, - TryCast, - Between, - Explain, - CreateMemoryTable, - SubqueryAlias, - Extension, - CreateView, - Distinct, - DropTable, - Repartition, - Partitioning, -) - -__version__ = importlib_metadata.version(__name__) - -__all__ = [ - "Config", - "DataFrame", - "SessionContext", - "SessionConfig", - "RuntimeConfig", - "Expr", - "AggregateUDF", - "ScalarUDF", - "column", - "literal", - "TableScan", - "Projection", - "DFSchema", - "DFField", - "Analyze", - "Sort", - "Limit", - "Filter", - 
"Like", - "ILike", - "SimilarTo", - "ScalarVariable", - "Alias", - "GetIndexedField", - "Not", - "IsNotNull", - "IsTrue", - "IsFalse", - "IsUnknown", - "IsNotTrue", - "IsNotFalse", - "IsNotUnknown", - "Negative", - "ScalarFunction", - "BuiltinScalarFunction", - "InList", - "Exists", - "Subquery", - "InSubquery", - "ScalarSubquery", - "GroupingSet", - "Placeholder", - "Case", - "Cast", - "TryCast", - "Between", - "Explain", - "SubqueryAlias", - "Extension", - "CreateMemoryTable", - "CreateView", - "Distinct", - "DropTable", - "Repartition", - "Partitioning", -] - - -class Accumulator(metaclass=ABCMeta): - @abstractmethod - def state(self) -> List[pa.Scalar]: - pass - - @abstractmethod - def update(self, values: pa.Array) -> None: - pass - - @abstractmethod - def merge(self, states: pa.Array) -> None: - pass - - @abstractmethod - def evaluate(self) -> pa.Scalar: - pass - - -def column(value): - return Expr.column(value) - - -col = column - - -def literal(value): - if not isinstance(value, pa.Scalar): - value = pa.scalar(value) - return Expr.literal(value) - - -lit = literal - - -def udf(func, input_types, return_type, volatility, name=None): - """ - Create a new User Defined Function - """ - if not callable(func): - raise TypeError("`func` argument must be callable") - if name is None: - name = func.__qualname__.lower() - return ScalarUDF( - name=name, - func=func, - input_types=input_types, - return_type=return_type, - volatility=volatility, - ) - - -def udaf(accum, input_type, return_type, state_type, volatility, name=None): - """ - Create a new User Defined Aggregate Function - """ - if not issubclass(accum, Accumulator): - raise TypeError( - "`accum` must implement the abstract base class Accumulator" - ) - if name is None: - name = accum.__qualname__.lower() - return AggregateUDF( - name=name, - accumulator=accum, - input_type=input_type, - return_type=return_type, - state_type=state_type, - volatility=volatility, - ) diff --git a/datafusion/cudf.py 
b/datafusion/cudf.py deleted file mode 100644 index d5f02156f..000000000 --- a/datafusion/cudf.py +++ /dev/null @@ -1,61 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import cudf -import datafusion -from datafusion.expr import Projection, TableScan, Column - - -class SessionContext: - def __init__(self): - self.datafusion_ctx = datafusion.SessionContext() - self.parquet_tables = {} - - def register_parquet(self, name, path): - self.parquet_tables[name] = path - self.datafusion_ctx.register_parquet(name, path) - - def to_cudf_expr(self, expr): - # get Python wrapper for logical expression - expr = expr.to_variant() - - if isinstance(expr, Column): - return expr.name() - else: - raise Exception("unsupported expression: {}".format(expr)) - - def to_cudf_df(self, plan): - # recurse down first to translate inputs into pandas data frames - inputs = [self.to_cudf_df(x) for x in plan.inputs()] - - # get Python wrapper for logical operator node - node = plan.to_variant() - - if isinstance(node, Projection): - args = [self.to_cudf_expr(expr) for expr in node.projections()] - return inputs[0][args] - elif isinstance(node, TableScan): - return cudf.read_parquet(self.parquet_tables[node.table_name()]) - else: - raise Exception( - 
"unsupported logical operator: {}".format(type(node)) - ) - - def sql(self, sql): - datafusion_df = self.datafusion_ctx.sql(sql) - plan = datafusion_df.logical_plan() - return self.to_cudf_df(plan) diff --git a/datafusion/pandas.py b/datafusion/pandas.py deleted file mode 100644 index f8e56512b..000000000 --- a/datafusion/pandas.py +++ /dev/null @@ -1,61 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -import pandas as pd -import datafusion -from datafusion.expr import Projection, TableScan, Column - - -class SessionContext: - def __init__(self): - self.datafusion_ctx = datafusion.SessionContext() - self.parquet_tables = {} - - def register_parquet(self, name, path): - self.parquet_tables[name] = path - self.datafusion_ctx.register_parquet(name, path) - - def to_pandas_expr(self, expr): - # get Python wrapper for logical expression - expr = expr.to_variant() - - if isinstance(expr, Column): - return expr.name() - else: - raise Exception("unsupported expression: {}".format(expr)) - - def to_pandas_df(self, plan): - # recurse down first to translate inputs into pandas data frames - inputs = [self.to_pandas_df(x) for x in plan.inputs()] - - # get Python wrapper for logical operator node - node = plan.to_variant() - - if isinstance(node, Projection): - args = [self.to_pandas_expr(expr) for expr in node.projections()] - return inputs[0][args] - elif isinstance(node, TableScan): - return pd.read_parquet(self.parquet_tables[node.table_name()]) - else: - raise Exception( - "unsupported logical operator: {}".format(type(node)) - ) - - def sql(self, sql): - datafusion_df = self.datafusion_ctx.sql(sql) - plan = datafusion_df.logical_plan() - return self.to_pandas_df(plan) diff --git a/datafusion/polars.py b/datafusion/polars.py deleted file mode 100644 index a1bafbef8..000000000 --- a/datafusion/polars.py +++ /dev/null @@ -1,84 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import polars -import datafusion -from datafusion.expr import Projection, TableScan, Aggregate -from datafusion.expr import Column, AggregateFunction - - -class SessionContext: - def __init__(self): - self.datafusion_ctx = datafusion.SessionContext() - self.parquet_tables = {} - - def register_parquet(self, name, path): - self.parquet_tables[name] = path - self.datafusion_ctx.register_parquet(name, path) - - def to_polars_expr(self, expr): - # get Python wrapper for logical expression - expr = expr.to_variant() - - if isinstance(expr, Column): - return polars.col(expr.name()) - else: - raise Exception("unsupported expression: {}".format(expr)) - - def to_polars_df(self, plan): - # recurse down first to translate inputs into Polars data frames - inputs = [self.to_polars_df(x) for x in plan.inputs()] - - # get Python wrapper for logical operator node - node = plan.to_variant() - - if isinstance(node, Projection): - args = [self.to_polars_expr(expr) for expr in node.projections()] - return inputs[0].select(*args) - elif isinstance(node, Aggregate): - groupby_expr = [ - self.to_polars_expr(expr) for expr in node.group_by_exprs() - ] - aggs = [] - for expr in node.aggregate_exprs(): - expr = expr.to_variant() - if isinstance(expr, AggregateFunction): - if expr.aggregate_type() == "COUNT": - aggs.append(polars.count().alias("{}".format(expr))) - else: - raise Exception( - "Unsupported aggregate function {}".format( - expr.aggregate_type() - ) - ) - else: - raise Exception( - "Unsupported aggregate function {}".format(expr) - ) - df = 
inputs[0].groupby(groupby_expr).agg(aggs) - return df - elif isinstance(node, TableScan): - return polars.read_parquet(self.parquet_tables[node.table_name()]) - else: - raise Exception( - "unsupported logical operator: {}".format(type(node)) - ) - - def sql(self, sql): - datafusion_df = self.datafusion_ctx.sql(sql) - plan = datafusion_df.logical_plan() - return self.to_polars_df(plan) diff --git a/datafusion/substrait.py b/datafusion/substrait.py deleted file mode 100644 index eff809a0c..000000000 --- a/datafusion/substrait.py +++ /dev/null @@ -1,23 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - - -from ._internal import substrait - - -def __getattr__(name): - return getattr(substrait, name) diff --git a/datafusion/tests/test_aggregation.py b/datafusion/tests/test_aggregation.py deleted file mode 100644 index 2c8c064b1..000000000 --- a/datafusion/tests/test_aggregation.py +++ /dev/null @@ -1,127 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. 
The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import numpy as np -import pyarrow as pa -import pytest - -from datafusion import SessionContext, column, lit -from datafusion import functions as f - - -@pytest.fixture -def df(): - ctx = SessionContext() - - # create a RecordBatch and a new DataFrame from it - batch = pa.RecordBatch.from_arrays( - [ - pa.array([1, 2, 3]), - pa.array([4, 4, 6]), - pa.array([9, 8, 5]), - ], - names=["a", "b", "c"], - ) - return ctx.create_dataframe([[batch]]) - - -def test_built_in_aggregation(df): - col_a = column("a") - col_b = column("b") - col_c = column("c") - - agg_df = df.aggregate( - [], - [ - f.approx_distinct(col_b), - f.approx_median(col_b), - f.approx_percentile_cont(col_b, lit(0.5)), - f.approx_percentile_cont_with_weight(col_b, lit(0.6), lit(0.5)), - f.array_agg(col_b), - f.avg(col_a), - f.corr(col_a, col_b), - f.count(col_a), - f.covar(col_a, col_b), - f.covar_pop(col_a, col_c), - f.covar_samp(col_b, col_c), - # f.grouping(col_a), # No physical plan implemented yet - f.max(col_a), - f.mean(col_b), - f.median(col_b), - f.min(col_a), - f.sum(col_b), - f.stddev(col_a), - f.stddev_pop(col_b), - f.stddev_samp(col_c), - f.var(col_a), - f.var_pop(col_b), - f.var_samp(col_c), - ], - ) - result = agg_df.collect()[0] - values_a, values_b, values_c = df.collect()[0] - - assert result.column(0) == pa.array([2], type=pa.uint64()) - assert result.column(1) == pa.array([4]) - assert result.column(2) == 
pa.array([4]) - assert result.column(3) == pa.array([6]) - assert result.column(4) == pa.array([[4, 4, 6]]) - np.testing.assert_array_almost_equal( - result.column(5), np.average(values_a) - ) - np.testing.assert_array_almost_equal( - result.column(6), np.corrcoef(values_a, values_b)[0][1] - ) - assert result.column(7) == pa.array([len(values_a)]) - # Sample (co)variance -> ddof=1 - # Population (co)variance -> ddof=0 - np.testing.assert_array_almost_equal( - result.column(8), np.cov(values_a, values_b, ddof=1)[0][1] - ) - np.testing.assert_array_almost_equal( - result.column(9), np.cov(values_a, values_c, ddof=0)[0][1] - ) - np.testing.assert_array_almost_equal( - result.column(10), np.cov(values_b, values_c, ddof=1)[0][1] - ) - np.testing.assert_array_almost_equal(result.column(11), np.max(values_a)) - np.testing.assert_array_almost_equal(result.column(12), np.mean(values_b)) - np.testing.assert_array_almost_equal( - result.column(13), np.median(values_b) - ) - np.testing.assert_array_almost_equal(result.column(14), np.min(values_a)) - np.testing.assert_array_almost_equal( - result.column(15), np.sum(values_b.to_pylist()) - ) - np.testing.assert_array_almost_equal( - result.column(16), np.std(values_a, ddof=1) - ) - np.testing.assert_array_almost_equal( - result.column(17), np.std(values_b, ddof=0) - ) - np.testing.assert_array_almost_equal( - result.column(18), np.std(values_c, ddof=1) - ) - np.testing.assert_array_almost_equal( - result.column(19), np.var(values_a, ddof=1) - ) - np.testing.assert_array_almost_equal( - result.column(20), np.var(values_b, ddof=0) - ) - np.testing.assert_array_almost_equal( - result.column(21), np.var(values_c, ddof=1) - ) diff --git a/datafusion/tests/test_context.py b/datafusion/tests/test_context.py deleted file mode 100644 index 1aea21c21..000000000 --- a/datafusion/tests/test_context.py +++ /dev/null @@ -1,351 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. 
See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import os - -import pyarrow as pa -import pyarrow.dataset as ds - -from datafusion import ( - column, - literal, - SessionContext, - SessionConfig, - RuntimeConfig, - DataFrame, -) -import pytest - - -def test_create_context_no_args(): - SessionContext() - - -def test_create_context_with_all_valid_args(): - runtime = ( - RuntimeConfig().with_disk_manager_os().with_fair_spill_pool(10000000) - ) - config = ( - SessionConfig() - .with_create_default_catalog_and_schema(True) - .with_default_catalog_and_schema("foo", "bar") - .with_target_partitions(1) - .with_information_schema(True) - .with_repartition_joins(False) - .with_repartition_aggregations(False) - .with_repartition_windows(False) - .with_parquet_pruning(False) - ) - - ctx = SessionContext(config, runtime) - - # verify that at least some of the arguments worked - ctx.catalog("foo").database("bar") - with pytest.raises(KeyError): - ctx.catalog("datafusion") - - -def test_register_record_batches(ctx): - # create a RecordBatch and register it as memtable - batch = pa.RecordBatch.from_arrays( - [pa.array([1, 2, 3]), pa.array([4, 5, 6])], - names=["a", "b"], - ) - - ctx.register_record_batches("t", [[batch]]) - - assert ctx.tables() == {"t"} - - result = ctx.sql("SELECT a+b, a-b FROM t").collect() - - assert result[0].column(0) == 
pa.array([5, 7, 9]) - assert result[0].column(1) == pa.array([-3, -3, -3]) - - -def test_create_dataframe_registers_unique_table_name(ctx): - # create a RecordBatch and register it as memtable - batch = pa.RecordBatch.from_arrays( - [pa.array([1, 2, 3]), pa.array([4, 5, 6])], - names=["a", "b"], - ) - - df = ctx.create_dataframe([[batch]]) - tables = list(ctx.tables()) - - assert df - assert len(tables) == 1 - assert len(tables[0]) == 33 - assert tables[0].startswith("c") - # ensure that the rest of the table name contains - # only hexadecimal numbers - for c in tables[0][1:]: - assert c in "0123456789abcdef" - - -def test_create_dataframe_registers_with_defined_table_name(ctx): - # create a RecordBatch and register it as memtable - batch = pa.RecordBatch.from_arrays( - [pa.array([1, 2, 3]), pa.array([4, 5, 6])], - names=["a", "b"], - ) - - df = ctx.create_dataframe([[batch]], name="tbl") - tables = list(ctx.tables()) - - assert df - assert len(tables) == 1 - assert tables[0] == "tbl" - - -def test_from_arrow_table(ctx): - # create a PyArrow table - data = {"a": [1, 2, 3], "b": [4, 5, 6]} - table = pa.Table.from_pydict(data) - - # convert to DataFrame - df = ctx.from_arrow_table(table) - tables = list(ctx.tables()) - - assert df - assert len(tables) == 1 - assert type(df) == DataFrame - assert set(df.schema().names) == {"a", "b"} - assert df.collect()[0].num_rows == 3 - - -def test_from_arrow_table_with_name(ctx): - # create a PyArrow table - data = {"a": [1, 2, 3], "b": [4, 5, 6]} - table = pa.Table.from_pydict(data) - - # convert to DataFrame with optional name - df = ctx.from_arrow_table(table, name="tbl") - tables = list(ctx.tables()) - - assert df - assert tables[0] == "tbl" - - -def test_from_pylist(ctx): - # create a dataframe from Python list - data = [ - {"a": 1, "b": 4}, - {"a": 2, "b": 5}, - {"a": 3, "b": 6}, - ] - - df = ctx.from_pylist(data) - tables = list(ctx.tables()) - - assert df - assert len(tables) == 1 - assert type(df) == DataFrame - assert 
set(df.schema().names) == {"a", "b"} - assert df.collect()[0].num_rows == 3 - - -def test_from_pydict(ctx): - # create a dataframe from Python dictionary - data = {"a": [1, 2, 3], "b": [4, 5, 6]} - - df = ctx.from_pydict(data) - tables = list(ctx.tables()) - - assert df - assert len(tables) == 1 - assert type(df) == DataFrame - assert set(df.schema().names) == {"a", "b"} - assert df.collect()[0].num_rows == 3 - - -def test_from_pandas(ctx): - # create a dataframe from pandas dataframe - pd = pytest.importorskip("pandas") - data = {"a": [1, 2, 3], "b": [4, 5, 6]} - pandas_df = pd.DataFrame(data) - - df = ctx.from_pandas(pandas_df) - tables = list(ctx.tables()) - - assert df - assert len(tables) == 1 - assert type(df) == DataFrame - assert set(df.schema().names) == {"a", "b"} - assert df.collect()[0].num_rows == 3 - - -def test_from_polars(ctx): - # create a dataframe from Polars dataframe - pd = pytest.importorskip("polars") - data = {"a": [1, 2, 3], "b": [4, 5, 6]} - polars_df = pd.DataFrame(data) - - df = ctx.from_polars(polars_df) - tables = list(ctx.tables()) - - assert df - assert len(tables) == 1 - assert type(df) == DataFrame - assert set(df.schema().names) == {"a", "b"} - assert df.collect()[0].num_rows == 3 - - -def test_register_table(ctx, database): - default = ctx.catalog() - public = default.database("public") - assert public.names() == {"csv", "csv1", "csv2"} - table = public.table("csv") - - ctx.register_table("csv3", table) - assert public.names() == {"csv", "csv1", "csv2", "csv3"} - - -def test_deregister_table(ctx, database): - default = ctx.catalog() - public = default.database("public") - assert public.names() == {"csv", "csv1", "csv2"} - - ctx.deregister_table("csv") - assert public.names() == {"csv1", "csv2"} - - -def test_register_dataset(ctx): - # create a RecordBatch and register it as a pyarrow.dataset.Dataset - batch = pa.RecordBatch.from_arrays( - [pa.array([1, 2, 3]), pa.array([4, 5, 6])], - names=["a", "b"], - ) - dataset = 
ds.dataset([batch]) - ctx.register_dataset("t", dataset) - - assert ctx.tables() == {"t"} - - result = ctx.sql("SELECT a+b, a-b FROM t").collect() - - assert result[0].column(0) == pa.array([5, 7, 9]) - assert result[0].column(1) == pa.array([-3, -3, -3]) - - -def test_dataset_filter(ctx, capfd): - # create a RecordBatch and register it as a pyarrow.dataset.Dataset - batch = pa.RecordBatch.from_arrays( - [pa.array([1, 2, 3]), pa.array([4, 5, 6])], - names=["a", "b"], - ) - dataset = ds.dataset([batch]) - ctx.register_dataset("t", dataset) - - assert ctx.tables() == {"t"} - df = ctx.sql("SELECT a+b, a-b FROM t WHERE a BETWEEN 2 and 3 AND b > 5") - - # Make sure the filter was pushed down in Physical Plan - df.explain() - captured = capfd.readouterr() - assert "filter_expr=(((a >= 2) and (a <= 3)) and (b > 5))" in captured.out - - result = df.collect() - - assert result[0].column(0) == pa.array([9]) - assert result[0].column(1) == pa.array([-3]) - - -def test_dataset_filter_nested_data(ctx): - # create Arrow StructArrays to test nested data types - data = pa.StructArray.from_arrays( - [pa.array([1, 2, 3]), pa.array([4, 5, 6])], - names=["a", "b"], - ) - batch = pa.RecordBatch.from_arrays( - [data], - names=["nested_data"], - ) - dataset = ds.dataset([batch]) - ctx.register_dataset("t", dataset) - - assert ctx.tables() == {"t"} - - df = ctx.table("t") - - # This filter will not be pushed down to DatasetExec since it - # isn't supported - df = df.select( - column("nested_data")["a"] + column("nested_data")["b"], - column("nested_data")["a"] - column("nested_data")["b"], - ).filter(column("nested_data")["b"] > literal(5)) - - result = df.collect() - - assert result[0].column(0) == pa.array([9]) - assert result[0].column(1) == pa.array([-3]) - - -def test_table_exist(ctx): - batch = pa.RecordBatch.from_arrays( - [pa.array([1, 2, 3]), pa.array([4, 5, 6])], - names=["a", "b"], - ) - dataset = ds.dataset([batch]) - ctx.register_dataset("t", dataset) - - assert 
ctx.table_exist("t") is True - - -def test_read_json(ctx): - path = os.path.dirname(os.path.abspath(__file__)) - - # Default - test_data_path = os.path.join(path, "data_test_context", "data.json") - df = ctx.read_json(test_data_path) - result = df.collect() - - assert result[0].column(0) == pa.array(["a", "b", "c"]) - assert result[0].column(1) == pa.array([1, 2, 3]) - - # Schema - schema = pa.schema( - [ - pa.field("A", pa.string(), nullable=True), - ] - ) - df = ctx.read_json(test_data_path, schema=schema) - result = df.collect() - - assert result[0].column(0) == pa.array(["a", "b", "c"]) - assert result[0].schema == schema - - # File extension - test_data_path = os.path.join(path, "data_test_context", "data.json") - df = ctx.read_json(test_data_path, file_extension=".json") - result = df.collect() - - assert result[0].column(0) == pa.array(["a", "b", "c"]) - assert result[0].column(1) == pa.array([1, 2, 3]) - - -def test_read_csv(ctx): - csv_df = ctx.read_csv(path="testing/data/csv/aggregate_test_100.csv") - csv_df.select(column("c1")).show() - - -def test_read_parquet(ctx): - csv_df = ctx.read_parquet(path="parquet/data/alltypes_plain.parquet") - csv_df.show() - - -def test_read_avro(ctx): - csv_df = ctx.read_avro(path="testing/data/avro/alltypes_plain.avro") - csv_df.show() diff --git a/datafusion/tests/test_dataframe.py b/datafusion/tests/test_dataframe.py deleted file mode 100644 index 611bcabe4..000000000 --- a/datafusion/tests/test_dataframe.py +++ /dev/null @@ -1,609 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import pyarrow as pa -import pytest - -from datafusion import functions as f -from datafusion import DataFrame, SessionContext, column, literal, udf - - -@pytest.fixture -def ctx(): - return SessionContext() - - -@pytest.fixture -def df(): - ctx = SessionContext() - - # create a RecordBatch and a new DataFrame from it - batch = pa.RecordBatch.from_arrays( - [pa.array([1, 2, 3]), pa.array([4, 5, 6]), pa.array([8, 5, 8])], - names=["a", "b", "c"], - ) - - return ctx.create_dataframe([[batch]]) - - -@pytest.fixture -def struct_df(): - ctx = SessionContext() - - # create a RecordBatch and a new DataFrame from it - batch = pa.RecordBatch.from_arrays( - [pa.array([{"c": 1}, {"c": 2}, {"c": 3}]), pa.array([4, 5, 6])], - names=["a", "b"], - ) - - return ctx.create_dataframe([[batch]]) - - -@pytest.fixture -def aggregate_df(): - ctx = SessionContext() - ctx.register_csv("test", "testing/data/csv/aggregate_test_100.csv") - return ctx.sql("select c1, sum(c2) from test group by c1") - - -def test_select(df): - df = df.select( - column("a") + column("b"), - column("a") - column("b"), - ) - - # execute and collect the first (and only) batch - result = df.collect()[0] - - assert result.column(0) == pa.array([5, 7, 9]) - assert result.column(1) == pa.array([-3, -3, -3]) - - -def test_select_columns(df): - df = df.select_columns("b", "a") - - # execute and collect the first (and only) batch - result = df.collect()[0] - - assert result.column(0) == pa.array([4, 5, 6]) - assert result.column(1) == pa.array([1, 2, 3]) - - -def test_filter(df): - df = df.select( - 
column("a") + column("b"), - column("a") - column("b"), - ).filter(column("a") > literal(2)) - - # execute and collect the first (and only) batch - result = df.collect()[0] - - assert result.column(0) == pa.array([9]) - assert result.column(1) == pa.array([-3]) - - -def test_sort(df): - df = df.sort(column("b").sort(ascending=False)) - - table = pa.Table.from_batches(df.collect()) - expected = {"a": [3, 2, 1], "b": [6, 5, 4], "c": [8, 5, 8]} - - assert table.to_pydict() == expected - - -def test_limit(df): - df = df.limit(1) - - # execute and collect the first (and only) batch - result = df.collect()[0] - - assert len(result.column(0)) == 1 - assert len(result.column(1)) == 1 - - -def test_with_column(df): - df = df.with_column("c", column("a") + column("b")) - - # execute and collect the first (and only) batch - result = df.collect()[0] - - assert result.schema.field(0).name == "a" - assert result.schema.field(1).name == "b" - assert result.schema.field(2).name == "c" - - assert result.column(0) == pa.array([1, 2, 3]) - assert result.column(1) == pa.array([4, 5, 6]) - assert result.column(2) == pa.array([5, 7, 9]) - - -def test_with_column_renamed(df): - df = df.with_column("c", column("a") + column("b")).with_column_renamed( - "c", "sum" - ) - - result = df.collect()[0] - - assert result.schema.field(0).name == "a" - assert result.schema.field(1).name == "b" - assert result.schema.field(2).name == "sum" - - -def test_udf(df): - # is_null is a pa function over arrays - is_null = udf( - lambda x: x.is_null(), - [pa.int64()], - pa.bool_(), - volatility="immutable", - ) - - df = df.select(is_null(column("a"))) - result = df.collect()[0].column(0) - - assert result == pa.array([False, False, False]) - - -def test_join(): - ctx = SessionContext() - - batch = pa.RecordBatch.from_arrays( - [pa.array([1, 2, 3]), pa.array([4, 5, 6])], - names=["a", "b"], - ) - df = ctx.create_dataframe([[batch]]) - - batch = pa.RecordBatch.from_arrays( - [pa.array([1, 2]), pa.array([8, 
10])], - names=["a", "c"], - ) - df1 = ctx.create_dataframe([[batch]]) - - df = df.join(df1, join_keys=(["a"], ["a"]), how="inner") - df = df.sort(column("a").sort(ascending=True)) - table = pa.Table.from_batches(df.collect()) - - expected = {"a": [1, 2], "c": [8, 10], "b": [4, 5]} - assert table.to_pydict() == expected - - -def test_distinct(): - ctx = SessionContext() - - batch = pa.RecordBatch.from_arrays( - [pa.array([1, 2, 3, 1, 2, 3]), pa.array([4, 5, 6, 4, 5, 6])], - names=["a", "b"], - ) - df_a = ( - ctx.create_dataframe([[batch]]) - .distinct() - .sort(column("a").sort(ascending=True)) - ) - - batch = pa.RecordBatch.from_arrays( - [pa.array([1, 2, 3]), pa.array([4, 5, 6])], - names=["a", "b"], - ) - df_b = ctx.create_dataframe([[batch]]).sort( - column("a").sort(ascending=True) - ) - - assert df_a.collect() == df_b.collect() - - -def test_window_functions(df): - df = df.select( - column("a"), - column("b"), - column("c"), - f.alias( - f.window("row_number", [], order_by=[f.order_by(column("c"))]), - "row", - ), - f.alias( - f.window("rank", [], order_by=[f.order_by(column("c"))]), - "rank", - ), - f.alias( - f.window("dense_rank", [], order_by=[f.order_by(column("c"))]), - "dense_rank", - ), - f.alias( - f.window("percent_rank", [], order_by=[f.order_by(column("c"))]), - "percent_rank", - ), - f.alias( - f.window("cume_dist", [], order_by=[f.order_by(column("b"))]), - "cume_dist", - ), - f.alias( - f.window( - "ntile", [literal(2)], order_by=[f.order_by(column("c"))] - ), - "ntile", - ), - f.alias( - f.window("lag", [column("b")], order_by=[f.order_by(column("b"))]), - "previous", - ), - f.alias( - f.window( - "lead", [column("b")], order_by=[f.order_by(column("b"))] - ), - "next", - ), - f.alias( - f.window( - "first_value", - [column("a")], - order_by=[f.order_by(column("b"))], - ), - "first_value", - ), - f.alias( - f.window( - "last_value", [column("b")], order_by=[f.order_by(column("b"))] - ), - "last_value", - ), - f.alias( - f.window( - "nth_value", 
- [column("b"), literal(2)], - order_by=[f.order_by(column("b"))], - ), - "2nd_value", - ), - ) - - table = pa.Table.from_batches(df.collect()) - - expected = { - "a": [1, 2, 3], - "b": [4, 5, 6], - "c": [8, 5, 8], - "row": [2, 1, 3], - "rank": [2, 1, 2], - "dense_rank": [2, 1, 2], - "percent_rank": [0.5, 0, 0.5], - "cume_dist": [0.3333333333333333, 0.6666666666666666, 1.0], - "ntile": [1, 1, 2], - "next": [5, 6, None], - "previous": [None, 4, 5], - "first_value": [1, 1, 1], - "last_value": [4, 5, 6], - "2nd_value": [None, 5, 5], - } - assert table.sort_by("a").to_pydict() == expected - - -def test_get_dataframe(tmp_path): - ctx = SessionContext() - - path = tmp_path / "test.csv" - table = pa.Table.from_arrays( - [ - [1, 2, 3, 4], - ["a", "b", "c", "d"], - [1.1, 2.2, 3.3, 4.4], - ], - names=["int", "str", "float"], - ) - pa.csv.write_csv(table, path) - - ctx.register_csv("csv", path) - - df = ctx.table("csv") - assert isinstance(df, DataFrame) - - -def test_struct_select(struct_df): - df = struct_df.select( - column("a")["c"] + column("b"), - column("a")["c"] - column("b"), - ) - - # execute and collect the first (and only) batch - result = df.collect()[0] - - assert result.column(0) == pa.array([5, 7, 9]) - assert result.column(1) == pa.array([-3, -3, -3]) - - -def test_explain(df): - df = df.select( - column("a") + column("b"), - column("a") - column("b"), - ) - df.explain() - - -def test_logical_plan(aggregate_df): - plan = aggregate_df.logical_plan() - - expected = "Projection: test.c1, SUM(test.c2)" - - assert expected == plan.display() - - expected = ( - "Projection: test.c1, SUM(test.c2)\n" - " Aggregate: groupBy=[[test.c1]], aggr=[[SUM(test.c2)]]\n" - " TableScan: test" - ) - - assert expected == plan.display_indent() - - -def test_optimized_logical_plan(aggregate_df): - plan = aggregate_df.optimized_logical_plan() - - expected = "Aggregate: groupBy=[[test.c1]], aggr=[[SUM(test.c2)]]" - - assert expected == plan.display() - - expected = ( - "Aggregate: 
groupBy=[[test.c1]], aggr=[[SUM(test.c2)]]\n" - " TableScan: test projection=[c1, c2]" - ) - - assert expected == plan.display_indent() - - -def test_execution_plan(aggregate_df): - plan = aggregate_df.execution_plan() - - expected = "AggregateExec: mode=FinalPartitioned, gby=[c1@0 as c1], aggr=[SUM(test.c2)]\n" # noqa: E501 - - assert expected == plan.display() - - expected = ( - "ProjectionExec: expr=[c1@0 as c1, SUM(test.c2)@1 as SUM(test.c2)]\n" - " Aggregate: groupBy=[[test.c1]], aggr=[[SUM(test.c2)]]\n" - " TableScan: test projection=[c1, c2]" - ) - - indent = plan.display_indent() - - # indent plan will be different for everyone due to absolute path - # to filename, so we just check for some expected content - assert "AggregateExec:" in indent - assert "CoalesceBatchesExec:" in indent - assert "RepartitionExec:" in indent - assert "CsvExec:" in indent - - ctx = SessionContext() - stream = ctx.execute(plan, 0) - # get the one and only batch - batch = stream.next() - assert batch is not None - # there should be no more batches - batch = stream.next() - assert batch is None - - -def test_repartition(df): - df.repartition(2) - - -def test_repartition_by_hash(df): - df.repartition_by_hash(column("a"), num=2) - - -def test_intersect(): - ctx = SessionContext() - - batch = pa.RecordBatch.from_arrays( - [pa.array([1, 2, 3]), pa.array([4, 5, 6])], - names=["a", "b"], - ) - df_a = ctx.create_dataframe([[batch]]) - - batch = pa.RecordBatch.from_arrays( - [pa.array([3, 4, 5]), pa.array([6, 7, 8])], - names=["a", "b"], - ) - df_b = ctx.create_dataframe([[batch]]) - - batch = pa.RecordBatch.from_arrays( - [pa.array([3]), pa.array([6])], - names=["a", "b"], - ) - df_c = ctx.create_dataframe([[batch]]).sort( - column("a").sort(ascending=True) - ) - - df_a_i_b = df_a.intersect(df_b).sort(column("a").sort(ascending=True)) - - assert df_c.collect() == df_a_i_b.collect() - - -def test_except_all(): - ctx = SessionContext() - - batch = pa.RecordBatch.from_arrays( - [pa.array([1, 
2, 3]), pa.array([4, 5, 6])], - names=["a", "b"], - ) - df_a = ctx.create_dataframe([[batch]]) - - batch = pa.RecordBatch.from_arrays( - [pa.array([3, 4, 5]), pa.array([6, 7, 8])], - names=["a", "b"], - ) - df_b = ctx.create_dataframe([[batch]]) - - batch = pa.RecordBatch.from_arrays( - [pa.array([1, 2]), pa.array([4, 5])], - names=["a", "b"], - ) - df_c = ctx.create_dataframe([[batch]]).sort( - column("a").sort(ascending=True) - ) - - df_a_e_b = df_a.except_all(df_b).sort(column("a").sort(ascending=True)) - - assert df_c.collect() == df_a_e_b.collect() - - -def test_collect_partitioned(): - ctx = SessionContext() - - batch = pa.RecordBatch.from_arrays( - [pa.array([1, 2, 3]), pa.array([4, 5, 6])], - names=["a", "b"], - ) - - assert [[batch]] == ctx.create_dataframe([[batch]]).collect_partitioned() - - -def test_union(ctx): - batch = pa.RecordBatch.from_arrays( - [pa.array([1, 2, 3]), pa.array([4, 5, 6])], - names=["a", "b"], - ) - df_a = ctx.create_dataframe([[batch]]) - - batch = pa.RecordBatch.from_arrays( - [pa.array([3, 4, 5]), pa.array([6, 7, 8])], - names=["a", "b"], - ) - df_b = ctx.create_dataframe([[batch]]) - - batch = pa.RecordBatch.from_arrays( - [pa.array([1, 2, 3, 3, 4, 5]), pa.array([4, 5, 6, 6, 7, 8])], - names=["a", "b"], - ) - df_c = ctx.create_dataframe([[batch]]).sort( - column("a").sort(ascending=True) - ) - - df_a_u_b = df_a.union(df_b).sort(column("a").sort(ascending=True)) - - assert df_c.collect() == df_a_u_b.collect() - - -def test_union_distinct(ctx): - batch = pa.RecordBatch.from_arrays( - [pa.array([1, 2, 3]), pa.array([4, 5, 6])], - names=["a", "b"], - ) - df_a = ctx.create_dataframe([[batch]]) - - batch = pa.RecordBatch.from_arrays( - [pa.array([3, 4, 5]), pa.array([6, 7, 8])], - names=["a", "b"], - ) - df_b = ctx.create_dataframe([[batch]]) - - batch = pa.RecordBatch.from_arrays( - [pa.array([1, 2, 3, 4, 5]), pa.array([4, 5, 6, 7, 8])], - names=["a", "b"], - ) - df_c = ctx.create_dataframe([[batch]]).sort( - 
column("a").sort(ascending=True) - ) - - df_a_u_b = df_a.union(df_b, True).sort(column("a").sort(ascending=True)) - - assert df_c.collect() == df_a_u_b.collect() - assert df_c.collect() == df_a_u_b.collect() - - -def test_cache(df): - assert df.cache().collect() == df.collect() - - -def test_count(df): - # Get number of rows - assert df.count() == 3 - - -def test_to_pandas(df): - # Skip test if pandas is not installed - pd = pytest.importorskip("pandas") - - # Convert datafusion dataframe to pandas dataframe - pandas_df = df.to_pandas() - assert type(pandas_df) == pd.DataFrame - assert pandas_df.shape == (3, 3) - assert set(pandas_df.columns) == {"a", "b", "c"} - - -def test_empty_to_pandas(df): - # Skip test if pandas is not installed - pd = pytest.importorskip("pandas") - - # Convert empty datafusion dataframe to pandas dataframe - pandas_df = df.limit(0).to_pandas() - assert type(pandas_df) == pd.DataFrame - assert pandas_df.shape == (0, 3) - assert set(pandas_df.columns) == {"a", "b", "c"} - - -def test_to_polars(df): - # Skip test if polars is not installed - pl = pytest.importorskip("polars") - - # Convert datafusion dataframe to polars dataframe - polars_df = df.to_polars() - assert type(polars_df) == pl.DataFrame - assert polars_df.shape == (3, 3) - assert set(polars_df.columns) == {"a", "b", "c"} - - -def test_empty_to_polars(df): - # Skip test if polars is not installed - pl = pytest.importorskip("polars") - - # Convert empty datafusion dataframe to polars dataframe - polars_df = df.limit(0).to_polars() - assert type(polars_df) == pl.DataFrame - assert polars_df.shape == (0, 3) - assert set(polars_df.columns) == {"a", "b", "c"} - - -def test_to_arrow_table(df): - # Convert datafusion dataframe to pyarrow Table - pyarrow_table = df.to_arrow_table() - assert type(pyarrow_table) == pa.Table - assert pyarrow_table.shape == (3, 3) - assert set(pyarrow_table.column_names) == {"a", "b", "c"} - - -def test_empty_to_arrow_table(df): - # Convert empty datafusion 
dataframe to pyarrow Table - pyarrow_table = df.limit(0).to_arrow_table() - assert type(pyarrow_table) == pa.Table - assert pyarrow_table.shape == (0, 3) - assert set(pyarrow_table.column_names) == {"a", "b", "c"} - - -def test_to_pylist(df): - # Convert datafusion dataframe to Python list - pylist = df.to_pylist() - assert type(pylist) == list - assert pylist == [ - {"a": 1, "b": 4, "c": 8}, - {"a": 2, "b": 5, "c": 5}, - {"a": 3, "b": 6, "c": 8}, - ] - - -def test_to_pydict(df): - # Convert datafusion dataframe to Python dictionary - pydict = df.to_pydict() - assert type(pydict) == dict - assert pydict == {"a": [1, 2, 3], "b": [4, 5, 6], "c": [8, 5, 8]} diff --git a/datafusion/tests/test_expr.py b/datafusion/tests/test_expr.py deleted file mode 100644 index 0c4869f27..000000000 --- a/datafusion/tests/test_expr.py +++ /dev/null @@ -1,110 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -from datafusion import SessionContext -from datafusion.expr import Column, Literal, BinaryExpr, AggregateFunction -from datafusion.expr import ( - Projection, - Filter, - Aggregate, - Limit, - Sort, - TableScan, -) -import pytest - - -@pytest.fixture -def test_ctx(): - ctx = SessionContext() - ctx.register_csv("test", "testing/data/csv/aggregate_test_100.csv") - return ctx - - -def test_projection(test_ctx): - df = test_ctx.sql("select c1, 123, c1 < 123 from test") - plan = df.logical_plan() - - plan = plan.to_variant() - assert isinstance(plan, Projection) - - expr = plan.projections() - - col1 = expr[0].to_variant() - assert isinstance(col1, Column) - assert col1.name() == "c1" - assert col1.qualified_name() == "test.c1" - - col2 = expr[1].to_variant() - assert isinstance(col2, Literal) - assert col2.data_type() == "Int64" - assert col2.value_i64() == 123 - - col3 = expr[2].to_variant() - assert isinstance(col3, BinaryExpr) - assert isinstance(col3.left().to_variant(), Column) - assert col3.op() == "<" - assert isinstance(col3.right().to_variant(), Literal) - - plan = plan.input()[0].to_variant() - assert isinstance(plan, TableScan) - - -def test_filter(test_ctx): - df = test_ctx.sql("select c1 from test WHERE c1 > 5") - plan = df.logical_plan() - - plan = plan.to_variant() - assert isinstance(plan, Projection) - - plan = plan.input()[0].to_variant() - assert isinstance(plan, Filter) - - -def test_limit(test_ctx): - df = test_ctx.sql("select c1 from test LIMIT 10") - plan = df.logical_plan() - - plan = plan.to_variant() - assert isinstance(plan, Limit) - - -def test_aggregate_query(test_ctx): - df = test_ctx.sql("select c1, count(*) from test group by c1") - plan = df.logical_plan() - - projection = plan.to_variant() - assert isinstance(projection, Projection) - - aggregate = projection.input()[0].to_variant() - assert isinstance(aggregate, Aggregate) - - col1 = aggregate.group_by_exprs()[0].to_variant() - assert isinstance(col1, Column) - assert col1.name() 
== "c1" - assert col1.qualified_name() == "test.c1" - - col2 = aggregate.aggregate_exprs()[0].to_variant() - assert isinstance(col2, AggregateFunction) - - -def test_sort(test_ctx): - df = test_ctx.sql("select c1 from test order by c1") - plan = df.logical_plan() - - plan = plan.to_variant() - assert isinstance(plan, Sort) diff --git a/datafusion/tests/test_functions.py b/datafusion/tests/test_functions.py deleted file mode 100644 index bea580859..000000000 --- a/datafusion/tests/test_functions.py +++ /dev/null @@ -1,413 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -import numpy as np -import pyarrow as pa -import pytest -from datetime import datetime - -from datafusion import SessionContext, column -from datafusion import functions as f -from datafusion import literal - - -@pytest.fixture -def df(): - ctx = SessionContext() - # create a RecordBatch and a new DataFrame from it - batch = pa.RecordBatch.from_arrays( - [ - pa.array(["Hello", "World", "!"]), - pa.array([4, 5, 6]), - pa.array(["hello ", " world ", " !"]), - pa.array( - [ - datetime(2022, 12, 31), - datetime(2027, 6, 26), - datetime(2020, 7, 2), - ] - ), - ], - names=["a", "b", "c", "d"], - ) - return ctx.create_dataframe([[batch]]) - - -def test_literal(df): - df = df.select( - literal(1), - literal("1"), - literal("OK"), - literal(3.14), - literal(True), - literal(b"hello world"), - ) - result = df.collect() - assert len(result) == 1 - result = result[0] - assert result.column(0) == pa.array([1] * 3) - assert result.column(1) == pa.array(["1"] * 3) - assert result.column(2) == pa.array(["OK"] * 3) - assert result.column(3) == pa.array([3.14] * 3) - assert result.column(4) == pa.array([True] * 3) - assert result.column(5) == pa.array([b"hello world"] * 3) - - -def test_lit_arith(df): - """ - Test literals with arithmetic operations - """ - df = df.select( - literal(1) + column("b"), f.concat(column("a"), literal("!")) - ) - result = df.collect() - assert len(result) == 1 - result = result[0] - assert result.column(0) == pa.array([5, 6, 7]) - assert result.column(1) == pa.array(["Hello!", "World!", "!!"]) - - -def test_math_functions(): - ctx = SessionContext() - # create a RecordBatch and a new DataFrame from it - batch = pa.RecordBatch.from_arrays( - [pa.array([0.1, -0.7, 0.55])], names=["value"] - ) - df = ctx.create_dataframe([[batch]]) - - values = np.array([0.1, -0.7, 0.55]) - col_v = column("value") - df = df.select( - f.abs(col_v), - f.sin(col_v), - f.cos(col_v), - f.tan(col_v), - f.asin(col_v), - f.acos(col_v), - f.exp(col_v), - f.ln(col_v + 
literal(pa.scalar(1))), - f.log2(col_v + literal(pa.scalar(1))), - f.log10(col_v + literal(pa.scalar(1))), - f.random(), - f.atan(col_v), - f.atan2(col_v, literal(pa.scalar(1.1))), - f.ceil(col_v), - f.floor(col_v), - f.power(col_v, literal(pa.scalar(3))), - f.pow(col_v, literal(pa.scalar(4))), - f.round(col_v), - f.sqrt(col_v), - f.signum(col_v), - f.trunc(col_v), - ) - batches = df.collect() - assert len(batches) == 1 - result = batches[0] - - np.testing.assert_array_almost_equal(result.column(0), np.abs(values)) - np.testing.assert_array_almost_equal(result.column(1), np.sin(values)) - np.testing.assert_array_almost_equal(result.column(2), np.cos(values)) - np.testing.assert_array_almost_equal(result.column(3), np.tan(values)) - np.testing.assert_array_almost_equal(result.column(4), np.arcsin(values)) - np.testing.assert_array_almost_equal(result.column(5), np.arccos(values)) - np.testing.assert_array_almost_equal(result.column(6), np.exp(values)) - np.testing.assert_array_almost_equal( - result.column(7), np.log(values + 1.0) - ) - np.testing.assert_array_almost_equal( - result.column(8), np.log2(values + 1.0) - ) - np.testing.assert_array_almost_equal( - result.column(9), np.log10(values + 1.0) - ) - np.testing.assert_array_less(result.column(10), np.ones_like(values)) - np.testing.assert_array_almost_equal(result.column(11), np.arctan(values)) - np.testing.assert_array_almost_equal( - result.column(12), np.arctan2(values, 1.1) - ) - np.testing.assert_array_almost_equal(result.column(13), np.ceil(values)) - np.testing.assert_array_almost_equal(result.column(14), np.floor(values)) - np.testing.assert_array_almost_equal( - result.column(15), np.power(values, 3) - ) - np.testing.assert_array_almost_equal( - result.column(16), np.power(values, 4) - ) - np.testing.assert_array_almost_equal(result.column(17), np.round(values)) - np.testing.assert_array_almost_equal(result.column(18), np.sqrt(values)) - np.testing.assert_array_almost_equal(result.column(19), 
np.sign(values)) - np.testing.assert_array_almost_equal(result.column(20), np.trunc(values)) - - -def test_string_functions(df): - df = df.select( - f.ascii(column("a")), - f.bit_length(column("a")), - f.btrim(literal(" World ")), - f.character_length(column("a")), - f.chr(literal(68)), - f.concat_ws("-", column("a"), literal("test")), - f.concat(column("a"), literal("?")), - f.initcap(column("c")), - f.left(column("a"), literal(3)), - f.length(column("c")), - f.lower(column("a")), - f.lpad(column("a"), literal(7)), - f.ltrim(column("c")), - f.md5(column("a")), - f.octet_length(column("a")), - f.repeat(column("a"), literal(2)), - f.replace(column("a"), literal("l"), literal("?")), - f.reverse(column("a")), - f.right(column("a"), literal(4)), - f.rpad(column("a"), literal(8)), - f.rtrim(column("c")), - f.split_part(column("a"), literal("l"), literal(1)), - f.starts_with(column("a"), literal("Wor")), - f.strpos(column("a"), literal("o")), - f.substr(column("a"), literal(3)), - f.translate(column("a"), literal("or"), literal("ld")), - f.trim(column("c")), - f.upper(column("c")), - ) - result = df.collect() - assert len(result) == 1 - result = result[0] - assert result.column(0) == pa.array( - [72, 87, 33], type=pa.int32() - ) # H = 72; W = 87; ! 
= 33 - assert result.column(1) == pa.array([40, 40, 8], type=pa.int32()) - assert result.column(2) == pa.array(["World", "World", "World"]) - assert result.column(3) == pa.array([5, 5, 1], type=pa.int32()) - assert result.column(4) == pa.array(["D", "D", "D"]) - assert result.column(5) == pa.array(["Hello-test", "World-test", "!-test"]) - assert result.column(6) == pa.array(["Hello?", "World?", "!?"]) - assert result.column(7) == pa.array(["Hello ", " World ", " !"]) - assert result.column(8) == pa.array(["Hel", "Wor", "!"]) - assert result.column(9) == pa.array([6, 7, 2], type=pa.int32()) - assert result.column(10) == pa.array(["hello", "world", "!"]) - assert result.column(11) == pa.array([" Hello", " World", " !"]) - assert result.column(12) == pa.array(["hello ", "world ", "!"]) - assert result.column(13) == pa.array( - [ - "8b1a9953c4611296a827abf8c47804d7", - "f5a7924e621e84c9280a9a27e1bcb7f6", - "9033e0e305f247c0c3c80d0c7848c8b3", - ] - ) - assert result.column(14) == pa.array([5, 5, 1], type=pa.int32()) - assert result.column(15) == pa.array(["HelloHello", "WorldWorld", "!!"]) - assert result.column(16) == pa.array(["He??o", "Wor?d", "!"]) - assert result.column(17) == pa.array(["olleH", "dlroW", "!"]) - assert result.column(18) == pa.array(["ello", "orld", "!"]) - assert result.column(19) == pa.array(["Hello ", "World ", "! 
"]) - assert result.column(20) == pa.array(["hello", " world", " !"]) - assert result.column(21) == pa.array(["He", "Wor", "!"]) - assert result.column(22) == pa.array([False, True, False]) - assert result.column(23) == pa.array([5, 2, 0], type=pa.int32()) - assert result.column(24) == pa.array(["llo", "rld", ""]) - assert result.column(25) == pa.array(["Helll", "Wldld", "!"]) - assert result.column(26) == pa.array(["hello", "world", "!"]) - assert result.column(27) == pa.array(["HELLO ", " WORLD ", " !"]) - - -def test_hash_functions(df): - exprs = [ - f.digest(column("a"), literal(m)) - for m in ( - "md5", - "sha224", - "sha256", - "sha384", - "sha512", - "blake2s", - "blake3", - ) - ] - df = df.select( - *exprs, - f.sha224(column("a")), - f.sha256(column("a")), - f.sha384(column("a")), - f.sha512(column("a")), - ) - result = df.collect() - assert len(result) == 1 - result = result[0] - b = bytearray.fromhex - assert result.column(0) == pa.array( - [ - b("8B1A9953C4611296A827ABF8C47804D7"), - b("F5A7924E621E84C9280A9A27E1BCB7F6"), - b("9033E0E305F247C0C3C80D0C7848C8B3"), - ] - ) - assert result.column(1) == pa.array( - [ - b("4149DA18AA8BFC2B1E382C6C26556D01A92C261B6436DAD5E3BE3FCC"), - b("12972632B6D3B6AA52BD6434552F08C1303D56B817119406466E9236"), - b("6641A7E8278BCD49E476E7ACAE158F4105B2952D22AEB2E0B9A231A0"), - ] - ) - assert result.column(2) == pa.array( - [ - b( - "185F8DB32271FE25F561A6FC938B2E26" - "4306EC304EDA518007D1764826381969" - ), - b( - "78AE647DC5544D227130A0682A51E30B" - "C7777FBB6D8A8F17007463A3ECD1D524" - ), - b( - "BB7208BC9B5D7C04F1236A82A0093A5E" - "33F40423D5BA8D4266F7092C3BA43B62" - ), - ] - ) - assert result.column(3) == pa.array( - [ - b( - "3519FE5AD2C596EFE3E276A6F351B8FC" - "0B03DB861782490D45F7598EBD0AB5FD" - "5520ED102F38C4A5EC834E98668035FC" - ), - b( - "ED7CED84875773603AF90402E42C65F3" - "B48A5E77F84ADC7A19E8F3E8D3101010" - "22F552AEC70E9E1087B225930C1D260A" - ), - b( - "1D0EC8C84EE9521E21F06774DE232367" - 
"B64DE628474CB5B2E372B699A1F55AE3" - "35CC37193EF823E33324DFD9A70738A6" - ), - ] - ) - assert result.column(4) == pa.array( - [ - b( - "3615F80C9D293ED7402687F94B22D58E" - "529B8CC7916F8FAC7FDDF7FBD5AF4CF7" - "77D3D795A7A00A16BF7E7F3FB9561EE9" - "BAAE480DA9FE7A18769E71886B03F315" - ), - b( - "8EA77393A42AB8FA92500FB077A9509C" - "C32BC95E72712EFA116EDAF2EDFAE34F" - "BB682EFDD6C5DD13C117E08BD4AAEF71" - "291D8AACE2F890273081D0677C16DF0F" - ), - b( - "3831A6A6155E509DEE59A7F451EB3532" - "4D8F8F2DF6E3708894740F98FDEE2388" - "9F4DE5ADB0C5010DFB555CDA77C8AB5D" - "C902094C52DE3278F35A75EBC25F093A" - ), - ] - ) - assert result.column(5) == pa.array( - [ - b( - "F73A5FBF881F89B814871F46E26AD3FA" - "37CB2921C5E8561618639015B3CCBB71" - ), - b( - "B792A0383FB9E7A189EC150686579532" - "854E44B71AC394831DAED169BA85CCC5" - ), - b( - "27988A0E51812297C77A433F63523334" - "6AEE29A829DCF4F46E0F58F402C6CFCB" - ), - ] - ) - assert result.column(6) == pa.array( - [ - b( - "FBC2B0516EE8744D293B980779178A35" - "08850FDCFE965985782C39601B65794F" - ), - b( - "BF73D18575A736E4037D45F9E316085B" - "86C19BE6363DE6AA789E13DEAACC1C4E" - ), - b( - "C8D11B9F7237E4034ADBCD2005735F9B" - "C4C597C75AD89F4492BEC8F77D15F7EB" - ), - ] - ) - assert result.column(7) == result.column(1) # SHA-224 - assert result.column(8) == result.column(2) # SHA-256 - assert result.column(9) == result.column(3) # SHA-384 - assert result.column(10) == result.column(4) # SHA-512 - - -def test_temporal_functions(df): - df = df.select( - f.date_part(literal("month"), column("d")), - f.datepart(literal("year"), column("d")), - f.date_trunc(literal("month"), column("d")), - f.datetrunc(literal("day"), column("d")), - f.date_bin( - literal("15 minutes"), - column("d"), - literal("2001-01-01 00:02:30"), - ), - f.from_unixtime(literal(1673383974)), - f.to_timestamp(literal("2023-09-07 05:06:14.523952")), - f.to_timestamp_seconds(literal("2023-09-07 05:06:14.523952")), - f.to_timestamp_millis(literal("2023-09-07 05:06:14.523952")), - 
f.to_timestamp_micros(literal("2023-09-07 05:06:14.523952")), - ) - result = df.collect() - assert len(result) == 1 - result = result[0] - assert result.column(0) == pa.array([12, 6, 7], type=pa.float64()) - assert result.column(1) == pa.array([2022, 2027, 2020], type=pa.float64()) - assert result.column(2) == pa.array( - [datetime(2022, 12, 1), datetime(2027, 6, 1), datetime(2020, 7, 1)], - type=pa.timestamp("ns"), - ) - assert result.column(3) == pa.array( - [datetime(2022, 12, 31), datetime(2027, 6, 26), datetime(2020, 7, 2)], - type=pa.timestamp("ns"), - ) - assert result.column(4) == pa.array( - [ - datetime(2022, 12, 30, 23, 47, 30), - datetime(2027, 6, 25, 23, 47, 30), - datetime(2020, 7, 1, 23, 47, 30), - ], - type=pa.timestamp("ns"), - ) - assert result.column(5) == pa.array( - [datetime(2023, 1, 10, 20, 52, 54)] * 3, type=pa.timestamp("s") - ) - assert result.column(6) == pa.array( - [datetime(2023, 9, 7, 5, 6, 14, 523952)] * 3, type=pa.timestamp("ns") - ) - assert result.column(7) == pa.array( - [datetime(2023, 9, 7, 5, 6, 14)] * 3, type=pa.timestamp("s") - ) - assert result.column(8) == pa.array( - [datetime(2023, 9, 7, 5, 6, 14, 523000)] * 3, type=pa.timestamp("ms") - ) - assert result.column(9) == pa.array( - [datetime(2023, 9, 7, 5, 6, 14, 523952)] * 3, type=pa.timestamp("us") - ) diff --git a/datafusion/tests/test_sql.py b/datafusion/tests/test_sql.py deleted file mode 100644 index 638a222dc..000000000 --- a/datafusion/tests/test_sql.py +++ /dev/null @@ -1,295 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import numpy as np -import pyarrow as pa -import pyarrow.dataset as ds -import pytest - -from datafusion import udf - -from . import generic as helpers - - -def test_no_table(ctx): - with pytest.raises(Exception, match="DataFusion error"): - ctx.sql("SELECT a FROM b").collect() - - -def test_register_csv(ctx, tmp_path): - path = tmp_path / "test.csv" - - table = pa.Table.from_arrays( - [ - [1, 2, 3, 4], - ["a", "b", "c", "d"], - [1.1, 2.2, 3.3, 4.4], - ], - names=["int", "str", "float"], - ) - pa.csv.write_csv(table, path) - - ctx.register_csv("csv", path) - ctx.register_csv("csv1", str(path)) - ctx.register_csv( - "csv2", - path, - has_header=True, - delimiter=",", - schema_infer_max_records=10, - ) - alternative_schema = pa.schema( - [ - ("some_int", pa.int16()), - ("some_bytes", pa.string()), - ("some_floats", pa.float32()), - ] - ) - ctx.register_csv("csv3", path, schema=alternative_schema) - - assert ctx.tables() == {"csv", "csv1", "csv2", "csv3"} - - for table in ["csv", "csv1", "csv2"]: - result = ctx.sql(f"SELECT COUNT(int) AS cnt FROM {table}").collect() - result = pa.Table.from_batches(result) - assert result.to_pydict() == {"cnt": [4]} - - result = ctx.sql("SELECT * FROM csv3").collect() - result = pa.Table.from_batches(result) - assert result.schema == alternative_schema - - with pytest.raises( - ValueError, match="Delimiter must be a single character" - ): - ctx.register_csv("csv4", path, delimiter="wrong") - - -def test_register_parquet(ctx, tmp_path): - path = helpers.write_parquet(tmp_path / "a.parquet", helpers.data()) - 
ctx.register_parquet("t", path) - assert ctx.tables() == {"t"} - - result = ctx.sql("SELECT COUNT(a) AS cnt FROM t").collect() - result = pa.Table.from_batches(result) - assert result.to_pydict() == {"cnt": [100]} - - -def test_register_parquet_partitioned(ctx, tmp_path): - dir_root = tmp_path / "dataset_parquet_partitioned" - dir_root.mkdir(exist_ok=False) - (dir_root / "grp=a").mkdir(exist_ok=False) - (dir_root / "grp=b").mkdir(exist_ok=False) - - table = pa.Table.from_arrays( - [ - [1, 2, 3, 4], - ["a", "b", "c", "d"], - [1.1, 2.2, 3.3, 4.4], - ], - names=["int", "str", "float"], - ) - pa.parquet.write_table(table.slice(0, 3), dir_root / "grp=a/file.parquet") - pa.parquet.write_table(table.slice(3, 4), dir_root / "grp=b/file.parquet") - - ctx.register_parquet( - "datapp", - str(dir_root), - table_partition_cols=[("grp", "string")], - parquet_pruning=True, - file_extension=".parquet", - ) - assert ctx.tables() == {"datapp"} - - result = ctx.sql( - "SELECT grp, COUNT(*) AS cnt FROM datapp GROUP BY grp" - ).collect() - result = pa.Table.from_batches(result) - - rd = result.to_pydict() - assert dict(zip(rd["grp"], rd["cnt"])) == {"a": 3, "b": 1} - - -def test_register_dataset(ctx, tmp_path): - path = helpers.write_parquet(tmp_path / "a.parquet", helpers.data()) - dataset = ds.dataset(path, format="parquet") - - ctx.register_dataset("t", dataset) - assert ctx.tables() == {"t"} - - result = ctx.sql("SELECT COUNT(a) AS cnt FROM t").collect() - result = pa.Table.from_batches(result) - assert result.to_pydict() == {"cnt": [100]} - - -def test_execute(ctx, tmp_path): - data = [1, 1, 2, 2, 3, 11, 12] - - # single column, "a" - path = helpers.write_parquet(tmp_path / "a.parquet", pa.array(data)) - ctx.register_parquet("t", path) - - assert ctx.tables() == {"t"} - - # count - result = ctx.sql( - "SELECT COUNT(a) AS cnt FROM t WHERE a IS NOT NULL" - ).collect() - - expected = pa.array([7], pa.int64()) - expected = [pa.RecordBatch.from_arrays([expected], ["cnt"])] - assert 
result == expected - - # where - expected = pa.array([2], pa.int64()) - expected = [pa.RecordBatch.from_arrays([expected], ["cnt"])] - result = ctx.sql("SELECT COUNT(a) AS cnt FROM t WHERE a > 10").collect() - assert result == expected - - # group by - results = ctx.sql( - "SELECT CAST(a as int) AS a, COUNT(a) AS cnt FROM t GROUP BY a" - ).collect() - - # group by returns batches - result_keys = [] - result_values = [] - for result in results: - pydict = result.to_pydict() - result_keys.extend(pydict["a"]) - result_values.extend(pydict["cnt"]) - - result_keys, result_values = ( - list(t) for t in zip(*sorted(zip(result_keys, result_values))) - ) - - assert result_keys == [1, 2, 3, 11, 12] - assert result_values == [2, 2, 1, 1, 1] - - # order by - result = ctx.sql( - "SELECT a, CAST(a AS int) AS a_int FROM t ORDER BY a DESC LIMIT 2" - ).collect() - expected_a = pa.array([50.0219, 50.0152], pa.float64()) - expected_cast = pa.array([50, 50], pa.int32()) - expected = [ - pa.RecordBatch.from_arrays([expected_a, expected_cast], ["a", "a_int"]) - ] - np.testing.assert_equal(expected[0].column(1), expected[0].column(1)) - - -def test_cast(ctx, tmp_path): - """ - Verify that we can cast - """ - path = helpers.write_parquet(tmp_path / "a.parquet", helpers.data()) - ctx.register_parquet("t", path) - - valid_types = [ - "smallint", - "int", - "bigint", - "float(32)", - "float(64)", - "float", - ] - - select = ", ".join( - [f"CAST(9 AS {t}) AS A{i}" for i, t in enumerate(valid_types)] - ) - - # can execute, which implies that we can cast - ctx.sql(f"SELECT {select} FROM t").collect() - - -@pytest.mark.parametrize( - ("fn", "input_types", "output_type", "input_values", "expected_values"), - [ - ( - lambda x: x, - [pa.float64()], - pa.float64(), - [-1.2, None, 1.2], - [-1.2, None, 1.2], - ), - ( - lambda x: x.is_null(), - [pa.float64()], - pa.bool_(), - [-1.2, None, 1.2], - [False, True, False], - ), - ], -) -def test_udf( - ctx, tmp_path, fn, input_types, output_type, 
input_values, expected_values -): - # write to disk - path = helpers.write_parquet( - tmp_path / "a.parquet", pa.array(input_values) - ) - ctx.register_parquet("t", path) - - func = udf( - fn, input_types, output_type, name="func", volatility="immutable" - ) - ctx.register_udf(func) - - batches = ctx.sql("SELECT func(a) AS tt FROM t").collect() - result = batches[0].column(0) - - assert result == pa.array(expected_values) - - -_null_mask = np.array([False, True, False]) - - -@pytest.mark.parametrize( - "arr", - [ - pa.array(["a", "b", "c"], pa.utf8(), _null_mask), - pa.array(["a", "b", "c"], pa.large_utf8(), _null_mask), - pa.array([b"1", b"2", b"3"], pa.binary(), _null_mask), - pa.array([b"1111", b"2222", b"3333"], pa.large_binary(), _null_mask), - pa.array([False, True, True], None, _null_mask), - pa.array([0, 1, 2], None), - helpers.data_binary_other(), - helpers.data_date32(), - helpers.data_with_nans(), - # C data interface missing - pytest.param( - pa.array([b"1111", b"2222", b"3333"], pa.binary(4), _null_mask), - marks=pytest.mark.xfail, - ), - pytest.param(helpers.data_datetime("s"), marks=pytest.mark.xfail), - pytest.param(helpers.data_datetime("ms"), marks=pytest.mark.xfail), - pytest.param(helpers.data_datetime("us"), marks=pytest.mark.xfail), - pytest.param(helpers.data_datetime("ns"), marks=pytest.mark.xfail), - # Not writtable to parquet - pytest.param(helpers.data_timedelta("s"), marks=pytest.mark.xfail), - pytest.param(helpers.data_timedelta("ms"), marks=pytest.mark.xfail), - pytest.param(helpers.data_timedelta("us"), marks=pytest.mark.xfail), - pytest.param(helpers.data_timedelta("ns"), marks=pytest.mark.xfail), - ], -) -def test_simple_select(ctx, tmp_path, arr): - path = helpers.write_parquet(tmp_path / "a.parquet", arr) - ctx.register_parquet("t", path) - - batches = ctx.sql("SELECT a AS tt FROM t").collect() - result = batches[0].column(0) - - np.testing.assert_equal(result, arr) diff --git a/datafusion/tests/test_substrait.py 
b/datafusion/tests/test_substrait.py deleted file mode 100644 index 9a08b760e..000000000 --- a/datafusion/tests/test_substrait.py +++ /dev/null @@ -1,55 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import pyarrow as pa - -from datafusion import SessionContext -from datafusion import substrait as ss -import pytest - - -@pytest.fixture -def ctx(): - return SessionContext() - - -def test_substrait_serialization(ctx): - batch = pa.RecordBatch.from_arrays( - [pa.array([1, 2, 3]), pa.array([4, 5, 6])], - names=["a", "b"], - ) - - ctx.register_record_batches("t", [[batch]]) - - assert ctx.tables() == {"t"} - - # For now just make sure the method calls blow up - substrait_plan = ss.substrait.serde.serialize_to_plan( - "SELECT * FROM t", ctx - ) - substrait_bytes = ss.substrait.serde.serialize_bytes( - "SELECT * FROM t", ctx - ) - substrait_plan = ss.substrait.serde.deserialize_bytes(substrait_bytes) - logical_plan = ss.substrait.consumer.from_substrait_plan( - ctx, substrait_plan - ) - - # demonstrate how to create a DataFrame from a deserialized logical plan - df = ctx.create_dataframe_from_logical_plan(logical_plan) - - substrait_plan = ss.substrait.producer.to_substrait_plan(df.logical_plan()) diff --git 
a/datafusion/tests/test_udaf.py b/datafusion/tests/test_udaf.py deleted file mode 100644 index c2b29d199..000000000 --- a/datafusion/tests/test_udaf.py +++ /dev/null @@ -1,136 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -from typing import List - -import pyarrow as pa -import pyarrow.compute as pc -import pytest - -from datafusion import Accumulator, SessionContext, column, udaf - - -class Summarize(Accumulator): - """ - Interface of a user-defined accumulation. - """ - - def __init__(self): - self._sum = pa.scalar(0.0) - - def state(self) -> List[pa.Scalar]: - return [self._sum] - - def update(self, values: pa.Array) -> None: - # Not nice since pyarrow scalars can't be summed yet. - # This breaks on `None` - self._sum = pa.scalar(self._sum.as_py() + pc.sum(values).as_py()) - - def merge(self, states: pa.Array) -> None: - # Not nice since pyarrow scalars can't be summed yet. 
- # This breaks on `None` - self._sum = pa.scalar(self._sum.as_py() + pc.sum(states).as_py()) - - def evaluate(self) -> pa.Scalar: - return self._sum - - -class NotSubclassOfAccumulator: - pass - - -class MissingMethods(Accumulator): - def __init__(self): - self._sum = pa.scalar(0) - - def state(self) -> List[pa.Scalar]: - return [self._sum] - - -@pytest.fixture -def df(): - ctx = SessionContext() - - # create a RecordBatch and a new DataFrame from it - batch = pa.RecordBatch.from_arrays( - [pa.array([1, 2, 3]), pa.array([4, 4, 6])], - names=["a", "b"], - ) - return ctx.create_dataframe([[batch]]) - - -@pytest.mark.skip(reason="df.collect() will hang, need more investigations") -def test_errors(df): - with pytest.raises(TypeError): - udaf( - NotSubclassOfAccumulator, - pa.float64(), - pa.float64(), - [pa.float64()], - volatility="immutable", - ) - - accum = udaf( - MissingMethods, - pa.int64(), - pa.int64(), - [pa.int64()], - volatility="immutable", - ) - df = df.aggregate([], [accum(column("a"))]) - - msg = ( - "Can't instantiate abstract class MissingMethods with abstract " - "methods evaluate, merge, update" - ) - with pytest.raises(Exception, match=msg): - df.collect() - - -def test_aggregate(df): - summarize = udaf( - Summarize, - pa.float64(), - pa.float64(), - [pa.float64()], - volatility="immutable", - ) - - df = df.aggregate([], [summarize(column("a"))]) - - # execute and collect the first (and only) batch - result = df.collect()[0] - - assert result.column(0) == pa.array([1.0 + 2.0 + 3.0]) - - -def test_group_by(df): - summarize = udaf( - Summarize, - pa.float64(), - pa.float64(), - [pa.float64()], - volatility="immutable", - ) - - df = df.aggregate([column("b")], [summarize(column("a"))]) - - batches = df.collect() - - arrays = [batch.column(1) for batch in batches] - joined = pa.concat_arrays(arrays) - assert joined == pa.array([1.0 + 2.0, 3.0]) diff --git a/dev/changelog/43.0.0.md b/dev/changelog/43.0.0.md new file mode 100644 index 
000000000..bbb766910 --- /dev/null +++ b/dev/changelog/43.0.0.md @@ -0,0 +1,73 @@ + + +# Apache DataFusion Python 43.0.0 Changelog + +This release consists of 26 commits from 7 contributors. See credits at the end of this changelog for more information. + +**Implemented enhancements:** + +- feat: expose `drop` method [#913](https://github.com/apache/datafusion-python/pull/913) (ion-elgreco) +- feat: expose `join_on` [#914](https://github.com/apache/datafusion-python/pull/914) (ion-elgreco) +- feat: add fill_null/nan expressions [#919](https://github.com/apache/datafusion-python/pull/919) (ion-elgreco) +- feat: add `with_columns` [#909](https://github.com/apache/datafusion-python/pull/909) (ion-elgreco) +- feat: add `cast` to DataFrame [#916](https://github.com/apache/datafusion-python/pull/916) (ion-elgreco) +- feat: add `head`, `tail` methods [#915](https://github.com/apache/datafusion-python/pull/915) (ion-elgreco) + +**Fixed bugs:** + +- fix: remove use of deprecated `make_scalar_function` [#906](https://github.com/apache/datafusion-python/pull/906) (Michael-J-Ward) +- fix: udwf example [#948](https://github.com/apache/datafusion-python/pull/948) (mesejo) + +**Other:** + +- Ts/minor updates release process [#903](https://github.com/apache/datafusion-python/pull/903) (timsaucer) +- build(deps): bump pyo3 from 0.22.3 to 0.22.4 [#910](https://github.com/apache/datafusion-python/pull/910) (dependabot[bot]) +- refactor: `from_arrow` use protocol typehints [#917](https://github.com/apache/datafusion-python/pull/917) (ion-elgreco) +- Change requires-python version in pyproject.toml [#924](https://github.com/apache/datafusion-python/pull/924) (kosiew) +- chore: deprecate `select_columns` [#911](https://github.com/apache/datafusion-python/pull/911) (ion-elgreco) +- build(deps): bump uuid from 1.10.0 to 1.11.0 [#927](https://github.com/apache/datafusion-python/pull/927) (dependabot[bot]) +- Add array_empty scalar function 
[#931](https://github.com/apache/datafusion-python/pull/931) (kosiew) +- add `cardinality` function to calculate total distinct elements in an array [#937](https://github.com/apache/datafusion-python/pull/937) (kosiew) +- Add empty scalar function (alias of array_empty), fix a small typo [#938](https://github.com/apache/datafusion-python/pull/938) (kosiew) +- README How to develop section now also works on Apple M1 [#940](https://github.com/apache/datafusion-python/pull/940) (drauschenbach) +- refactor: dataframe `join` params [#912](https://github.com/apache/datafusion-python/pull/912) (ion-elgreco) +- Upgrade to Datafusion 43 [#905](https://github.com/apache/datafusion-python/pull/905) (Michael-J-Ward) +- build(deps): bump tokio from 1.40.0 to 1.41.1 [#946](https://github.com/apache/datafusion-python/pull/946) (dependabot[bot]) +- Add list_cat, list_concat, list_repeat [#942](https://github.com/apache/datafusion-python/pull/942) (kosiew) +- Add foreign table providers [#921](https://github.com/apache/datafusion-python/pull/921) (timsaucer) +- Add make_list and tests for make_list, make_array [#949](https://github.com/apache/datafusion-python/pull/949) (kosiew) +- Documentation updates: simplify examples and add section on data sources [#955](https://github.com/apache/datafusion-python/pull/955) (timsaucer) +- Add datafusion.extract [#959](https://github.com/apache/datafusion-python/pull/959) (kosiew) + +## Credits + +Thank you to everyone who contributed to this release. Here is a breakdown of commits (PRs merged) per contributor. + +``` + 9 Ion Koutsouris + 7 kosiew + 3 Tim Saucer + 3 dependabot[bot] + 2 Michael J Ward + 1 Daniel Mesejo + 1 David Rauschenbach +``` + +Thank you also to everyone who contributed in other ways such as filing issues, reviewing PRs, and providing feedback on this release. 
diff --git a/dev/changelog/44.0.0.md b/dev/changelog/44.0.0.md new file mode 100644 index 000000000..c5ed4bdb0 --- /dev/null +++ b/dev/changelog/44.0.0.md @@ -0,0 +1,58 @@ + + +# Apache DataFusion Python 44.0.0 Changelog + +This release consists of 12 commits from 5 contributors. See credits at the end of this changelog for more information. + +**Implemented enhancements:** + +- feat: support enable_url_table config [#980](https://github.com/apache/datafusion-python/pull/980) (chenkovsky) +- feat: remove DataFusion pyarrow feat [#1000](https://github.com/apache/datafusion-python/pull/1000) (timsaucer) + +**Fixed bugs:** + +- fix: correct LZ0 to LZO in compression options [#995](https://github.com/apache/datafusion-python/pull/995) (kosiew) + +**Other:** + +- Add arrow cast [#962](https://github.com/apache/datafusion-python/pull/962) (kosiew) +- Fix small issues in pyproject.toml [#976](https://github.com/apache/datafusion-python/pull/976) (kylebarron) +- chore: set validation and type hint for ffi tableprovider [#983](https://github.com/apache/datafusion-python/pull/983) (ion-elgreco) +- Support async iteration of RecordBatchStream [#975](https://github.com/apache/datafusion-python/pull/975) (kylebarron) +- Chore/upgrade datafusion 44 [#973](https://github.com/apache/datafusion-python/pull/973) (timsaucer) +- Default to ZSTD compression when writing Parquet [#981](https://github.com/apache/datafusion-python/pull/981) (kosiew) +- Feat/use uv python management [#994](https://github.com/apache/datafusion-python/pull/994) (timsaucer) +- minor: Update dependencies prior to release [#999](https://github.com/apache/datafusion-python/pull/999) (timsaucer) +- Apply import ordering in ruff check [#1001](https://github.com/apache/datafusion-python/pull/1001) (timsaucer) + +## Credits + +Thank you to everyone who contributed to this release. Here is a breakdown of commits (PRs merged) per contributor. 
+ +``` + 5 Tim Saucer + 3 kosiew + 2 Kyle Barron + 1 Chongchen Chen + 1 Ion Koutsouris +``` + +Thank you also to everyone who contributed in other ways such as filing issues, reviewing PRs, and providing feedback on this release. + diff --git a/dev/changelog/45.0.0.md b/dev/changelog/45.0.0.md new file mode 100644 index 000000000..93659b171 --- /dev/null +++ b/dev/changelog/45.0.0.md @@ -0,0 +1,42 @@ + + +# Apache DataFusion Python 45.0.0 Changelog + +This release consists of 2 commits from 2 contributors. See credits at the end of this changelog for more information. + +**Fixed bugs:** + +- fix: add to_timestamp_nanos [#1020](https://github.com/apache/datafusion-python/pull/1020) (chenkovsky) + +**Other:** + +- Chore/upgrade datafusion 45 [#1010](https://github.com/apache/datafusion-python/pull/1010) (kevinjqliu) + +## Credits + +Thank you to everyone who contributed to this release. Here is a breakdown of commits (PRs merged) per contributor. + +``` + 1 Kevin Liu + 1 Tim Saucer +``` + +Thank you also to everyone who contributed in other ways such as filing issues, reviewing PRs, and providing feedback on this release. + diff --git a/dev/changelog/46.0.0.md b/dev/changelog/46.0.0.md new file mode 100644 index 000000000..3e5768099 --- /dev/null +++ b/dev/changelog/46.0.0.md @@ -0,0 +1,73 @@ + + +# Apache DataFusion Python 46.0.0 Changelog + +This release consists of 21 commits from 11 contributors. See credits at the end of this changelog for more information. 
+ +**Implemented enhancements:** + +- feat: reads using global ctx [#982](https://github.com/apache/datafusion-python/pull/982) (ion-elgreco) +- feat: Implementation of udf and udaf decorator [#1040](https://github.com/apache/datafusion-python/pull/1040) (CrystalZhou0529) +- feat: expose regex_count function [#1066](https://github.com/apache/datafusion-python/pull/1066) (nirnayroy) +- feat: Update DataFusion dependency to 46 [#1079](https://github.com/apache/datafusion-python/pull/1079) (timsaucer) + +**Fixed bugs:** + +- fix: add to_timestamp_nanos [#1020](https://github.com/apache/datafusion-python/pull/1020) (chenkovsky) +- fix: type checking [#993](https://github.com/apache/datafusion-python/pull/993) (chenkovsky) + +**Other:** + +- [infra] Fail Clippy on rust build warnings [#1029](https://github.com/apache/datafusion-python/pull/1029) (kevinjqliu) +- Add user documentation for the FFI approach [#1031](https://github.com/apache/datafusion-python/pull/1031) (timsaucer) +- build(deps): bump arrow from 54.1.0 to 54.2.0 [#1035](https://github.com/apache/datafusion-python/pull/1035) (dependabot[bot]) +- Chore: Release datafusion-python 45 [#1024](https://github.com/apache/datafusion-python/pull/1024) (timsaucer) +- Enable Dataframe to be converted into views which can be used in register_table [#1016](https://github.com/apache/datafusion-python/pull/1016) (kosiew) +- Add ruff check for missing futures import [#1052](https://github.com/apache/datafusion-python/pull/1052) (timsaucer) +- Enable take comments to assign issues to users [#1058](https://github.com/apache/datafusion-python/pull/1058) (timsaucer) +- Update python min version to 3.9 [#1043](https://github.com/apache/datafusion-python/pull/1043) (kevinjqliu) +- feat/improve ruff test coverage [#1055](https://github.com/apache/datafusion-python/pull/1055) (timsaucer) +- feat/making global context accessible for users [#1060](https://github.com/apache/datafusion-python/pull/1060) (jsai28) +- Renaming Internal 
Structs [#1059](https://github.com/apache/datafusion-python/pull/1059) (Spaarsh) +- test: add pytest asyncio tests [#1063](https://github.com/apache/datafusion-python/pull/1063) (jsai28) +- Add decorator for udwf [#1061](https://github.com/apache/datafusion-python/pull/1061) (kosiew) +- Add additional ruff suggestions [#1062](https://github.com/apache/datafusion-python/pull/1062) (Spaarsh) +- Improve collection during repr and repr_html [#1036](https://github.com/apache/datafusion-python/pull/1036) (timsaucer) + +## Credits + +Thank you to everyone who contributed to this release. Here is a breakdown of commits (PRs merged) per contributor. + +``` + 7 Tim Saucer + 2 Kevin Liu + 2 Spaarsh + 2 jsai28 + 2 kosiew + 1 Chen Chongchen + 1 Chongchen Chen + 1 Crystal Zhou + 1 Ion Koutsouris + 1 Nirnay Roy + 1 dependabot[bot] +``` + +Thank you also to everyone who contributed in other ways such as filing issues, reviewing PRs, and providing feedback on this release. + diff --git a/dev/changelog/47.0.0.md b/dev/changelog/47.0.0.md new file mode 100644 index 000000000..a7ed90313 --- /dev/null +++ b/dev/changelog/47.0.0.md @@ -0,0 +1,64 @@ + + +# Apache DataFusion Python 47.0.0 Changelog + +This release consists of 23 commits from 5 contributors. See credits at the end of this changelog for more information. 
+ +**Implemented enhancements:** + +- feat: support unparser [#1088](https://github.com/apache/datafusion-python/pull/1088) (chenkovsky) +- feat: update datafusion dependency 47 [#1107](https://github.com/apache/datafusion-python/pull/1107) (timsaucer) +- feat: alias with metadata [#1111](https://github.com/apache/datafusion-python/pull/1111) (chenkovsky) +- feat: add missing PyLogicalPlan to_variant [#1085](https://github.com/apache/datafusion-python/pull/1085) (chenkovsky) +- feat: add user defined table function support [#1113](https://github.com/apache/datafusion-python/pull/1113) (timsaucer) + +**Fixed bugs:** + +- fix: recursive import [#1117](https://github.com/apache/datafusion-python/pull/1117) (chenkovsky) + +**Other:** + +- Update changelog and version number [#1089](https://github.com/apache/datafusion-python/pull/1089) (timsaucer) +- Documentation updates: mention correct dataset on basics page [#1081](https://github.com/apache/datafusion-python/pull/1081) (floscha) +- Add Configurable HTML Table Formatter for DataFusion DataFrames in Python [#1100](https://github.com/apache/datafusion-python/pull/1100) (kosiew) +- Add DataFrame usage guide with HTML rendering customization options [#1108](https://github.com/apache/datafusion-python/pull/1108) (kosiew) +- 1075/enhancement/Make col class with __getattr__ [#1076](https://github.com/apache/datafusion-python/pull/1076) (deanm0000) +- 1064/enhancement/add functions to Expr class [#1074](https://github.com/apache/datafusion-python/pull/1074) (deanm0000) +- ci: require approving review [#1122](https://github.com/apache/datafusion-python/pull/1122) (timsaucer) +- Partial fix for 1078: Enhance DataFrame Formatter Configuration with Memory and Display Controls [#1119](https://github.com/apache/datafusion-python/pull/1119) (kosiew) +- Add fill_null method to DataFrame API for handling missing values [#1019](https://github.com/apache/datafusion-python/pull/1019) (kosiew) +- minor: reduce error size 
[#1126](https://github.com/apache/datafusion-python/pull/1126) (timsaucer) +- Move the udf module to user_defined [#1112](https://github.com/apache/datafusion-python/pull/1112) (timsaucer) +- add unit tests for expression functions [#1121](https://github.com/apache/datafusion-python/pull/1121) (timsaucer) + +## Credits + +Thank you to everyone who contributed to this release. Here is a breakdown of commits (PRs merged) per contributor. + +``` + 12 Tim Saucer + 4 Chen Chongchen + 4 kosiew + 2 deanm0000 + 1 Florian Schäfer +``` + +Thank you also to everyone who contributed in other ways such as filing issues, reviewing PRs, and providing feedback on this release. + diff --git a/dev/changelog/48.0.0.md b/dev/changelog/48.0.0.md new file mode 100644 index 000000000..80bc61aca --- /dev/null +++ b/dev/changelog/48.0.0.md @@ -0,0 +1,59 @@ + + +# Apache DataFusion Python 48.0.0 Changelog + +This release consists of 15 commits from 6 contributors. See credits at the end of this changelog for more information. 
+ +**Implemented enhancements:** + +- feat: upgrade df48 dependency [#1143](https://github.com/apache/datafusion-python/pull/1143) (timsaucer) +- feat: Support Parquet writer options [#1123](https://github.com/apache/datafusion-python/pull/1123) (nuno-faria) +- feat: dataframe string formatter [#1170](https://github.com/apache/datafusion-python/pull/1170) (timsaucer) +- feat: collect once during display() in jupyter notebooks [#1167](https://github.com/apache/datafusion-python/pull/1167) (timsaucer) +- feat: python based catalog and schema provider [#1156](https://github.com/apache/datafusion-python/pull/1156) (timsaucer) +- feat: add FFI support for user defined functions [#1145](https://github.com/apache/datafusion-python/pull/1145) (timsaucer) + +**Other:** + +- Release DataFusion 47.0.0 [#1130](https://github.com/apache/datafusion-python/pull/1130) (timsaucer) +- Add a documentation build step in CI [#1139](https://github.com/apache/datafusion-python/pull/1139) (crystalxyz) +- Add DataFrame API Documentation for DataFusion Python [#1132](https://github.com/apache/datafusion-python/pull/1132) (kosiew) +- Add Interruptible Query Execution in Jupyter via KeyboardInterrupt Support [#1141](https://github.com/apache/datafusion-python/pull/1141) (kosiew) +- Support types other than String and Int for partition columns [#1154](https://github.com/apache/datafusion-python/pull/1154) (miclegr) +- Fix signature of `__arrow_c_stream__` [#1168](https://github.com/apache/datafusion-python/pull/1168) (kylebarron) +- Consolidate DataFrame Docs: Merge HTML Rendering Section as Subpage [#1161](https://github.com/apache/datafusion-python/pull/1161) (kosiew) +- Add compression_level support to ParquetWriterOptions and enhance write_parquet to accept full options object [#1169](https://github.com/apache/datafusion-python/pull/1169) (kosiew) +- Simplify HTML Formatter Style Handling Using Script Injection [#1177](https://github.com/apache/datafusion-python/pull/1177) (kosiew) + +## 
Credits + +Thank you to everyone who contributed to this release. Here is a breakdown of commits (PRs merged) per contributor. + +``` + 6 Tim Saucer + 5 kosiew + 1 Crystal Zhou + 1 Kyle Barron + 1 Michele Gregori + 1 Nuno Faria +``` + +Thank you also to everyone who contributed in other ways such as filing issues, reviewing PRs, and providing feedback on this release. + diff --git a/dev/changelog/49.0.0.md b/dev/changelog/49.0.0.md new file mode 100644 index 000000000..008bd43bc --- /dev/null +++ b/dev/changelog/49.0.0.md @@ -0,0 +1,61 @@ + + +# Apache DataFusion Python 49.0.0 Changelog + +This release consists of 16 commits from 7 contributors. See credits at the end of this changelog for more information. + +**Fixed bugs:** + +- fix(build): Include build.rs in published crates [#1199](https://github.com/apache/datafusion-python/pull/1199) (colinmarc) + +**Other:** + +- 48.0.0 Release [#1175](https://github.com/apache/datafusion-python/pull/1175) (timsaucer) +- Update CI rules [#1188](https://github.com/apache/datafusion-python/pull/1188) (timsaucer) +- Fix Python UDAF Accumulator Interface example to Properly Handle State and Updates with List[Array] Types [#1192](https://github.com/apache/datafusion-python/pull/1192) (kosiew) +- chore: Upgrade datafusion to version 49 [#1200](https://github.com/apache/datafusion-python/pull/1200) (nuno-faria) +- Update how to dev instructions [#1179](https://github.com/apache/datafusion-python/pull/1179) (ntjohnson1) +- build(deps): bump object_store from 0.12.2 to 0.12.3 [#1189](https://github.com/apache/datafusion-python/pull/1189) (dependabot[bot]) +- build(deps): bump uuid from 1.17.0 to 1.18.0 [#1202](https://github.com/apache/datafusion-python/pull/1202) (dependabot[bot]) +- build(deps): bump async-trait from 0.1.88 to 0.1.89 [#1203](https://github.com/apache/datafusion-python/pull/1203) (dependabot[bot]) +- build(deps): bump slab from 0.4.10 to 0.4.11 [#1205](https://github.com/apache/datafusion-python/pull/1205) 
(dependabot[bot]) +- Improved window and aggregate function signature [#1187](https://github.com/apache/datafusion-python/pull/1187) (timsaucer) +- Optional improvements in verification instructions [#1183](https://github.com/apache/datafusion-python/pull/1183) (paleolimbot) +- Improve `show()` output for empty DataFrames [#1208](https://github.com/apache/datafusion-python/pull/1208) (kosiew) +- build(deps): bump actions/download-artifact from 4 to 5 [#1201](https://github.com/apache/datafusion-python/pull/1201) (dependabot[bot]) +- build(deps): bump url from 2.5.4 to 2.5.7 [#1210](https://github.com/apache/datafusion-python/pull/1210) (dependabot[bot]) +- build(deps): bump actions/checkout from 4 to 5 [#1204](https://github.com/apache/datafusion-python/pull/1204) (dependabot[bot]) + +## Credits + +Thank you to everyone who contributed to this release. Here is a breakdown of commits (PRs merged) per contributor. + +``` + 7 dependabot[bot] + 3 Tim Saucer + 2 kosiew + 1 Colin Marc + 1 Dewey Dunnington + 1 Nick + 1 Nuno Faria +``` + +Thank you also to everyone who contributed in other ways such as filing issues, reviewing PRs, and providing feedback on this release. + diff --git a/dev/changelog/50.0.0.md b/dev/changelog/50.0.0.md new file mode 100644 index 000000000..c3f09d180 --- /dev/null +++ b/dev/changelog/50.0.0.md @@ -0,0 +1,60 @@ + + +# Apache DataFusion Python 50.0.0 Changelog + +This release consists of 12 commits from 7 contributors. See credits at the end of this changelog for more information. 
+ +**Implemented enhancements:** + +- feat: allow passing a slice to and expression with the [] indexing [#1215](https://github.com/apache/datafusion-python/pull/1215) (timsaucer) + +**Documentation updates:** + +- docs: fix CaseBuilder documentation example [#1225](https://github.com/apache/datafusion-python/pull/1225) (IndexSeek) +- docs: update link to user example for custom table provider [#1224](https://github.com/apache/datafusion-python/pull/1224) (IndexSeek) +- docs: add apache iceberg as datafusion data source [#1240](https://github.com/apache/datafusion-python/pull/1240) (kevinjqliu) + +**Other:** + +- 49.0.0 release [#1211](https://github.com/apache/datafusion-python/pull/1211) (timsaucer) +- Update development guide in README.md [#1213](https://github.com/apache/datafusion-python/pull/1213) (YKoustubhRao) +- Add benchmark script and documentation for maximizing CPU usage in DataFusion Python [#1216](https://github.com/apache/datafusion-python/pull/1216) (kosiew) +- Fixing a few Typos [#1220](https://github.com/apache/datafusion-python/pull/1220) (ntjohnson1) +- Set fail on warning for documentation generation [#1218](https://github.com/apache/datafusion-python/pull/1218) (timsaucer) +- chore: remove redundant error transformation [#1232](https://github.com/apache/datafusion-python/pull/1232) (mesejo) +- Support string column identifiers for sort/aggregate/window and stricter Expr validation [#1221](https://github.com/apache/datafusion-python/pull/1221) (kosiew) +- Prepare for DF50 [#1231](https://github.com/apache/datafusion-python/pull/1231) (timsaucer) + +## Credits + +Thank you to everyone who contributed to this release. Here is a breakdown of commits (PRs merged) per contributor. + +``` + 4 Tim Saucer + 2 Tyler White + 2 kosiew + 1 Daniel Mesejo + 1 Kevin Liu + 1 Koustubh Rao + 1 Nick +``` + +Thank you also to everyone who contributed in other ways such as filing issues, reviewing PRs, and providing feedback on this release. 
+ diff --git a/dev/changelog/50.1.0.md b/dev/changelog/50.1.0.md new file mode 100644 index 000000000..3b9ff84ff --- /dev/null +++ b/dev/changelog/50.1.0.md @@ -0,0 +1,57 @@ + + +# Apache DataFusion Python 50.1.0 Changelog + +This release consists of 11 commits from 7 contributors. See credits at the end of this changelog for more information. + +**Breaking changes:** + +- Unify Table representations [#1256](https://github.com/apache/datafusion-python/pull/1256) (timsaucer) + +**Implemented enhancements:** + +- feat: expose DataFrame.write_table [#1264](https://github.com/apache/datafusion-python/pull/1264) (timsaucer) +- feat: expose` DataFrame.parse_sql_expr` [#1274](https://github.com/apache/datafusion-python/pull/1274) (milenkovicm) + +**Other:** + +- Update version number, add changelog [#1249](https://github.com/apache/datafusion-python/pull/1249) (timsaucer) +- Fix drop() method to handle quoted column names consistently [#1242](https://github.com/apache/datafusion-python/pull/1242) (H0TB0X420) +- Make Session Context `pyclass` frozen so interior mutability is only managed by rust [#1248](https://github.com/apache/datafusion-python/pull/1248) (ntjohnson1) +- macos-13 is deprecated [#1259](https://github.com/apache/datafusion-python/pull/1259) (kevinjqliu) +- Freeze PyO3 wrappers & introduce interior mutability to avoid PyO3 borrow errors [#1253](https://github.com/apache/datafusion-python/pull/1253) (kosiew) +- chore: update dependencies [#1269](https://github.com/apache/datafusion-python/pull/1269) (timsaucer) + +## Credits + +Thank you to everyone who contributed to this release. Here is a breakdown of commits (PRs merged) per contributor. + +``` + 4 Tim Saucer + 2 Siew Kam Onn + 1 H0TB0X420 + 1 Kevin Liu + 1 Marko Milenković + 1 Nick + 1 kosiew +``` + +Thank you also to everyone who contributed in other ways such as filing issues, reviewing PRs, and providing feedback on this release. 
+ diff --git a/dev/changelog/51.0.0.md b/dev/changelog/51.0.0.md new file mode 100644 index 000000000..cc157eb0d --- /dev/null +++ b/dev/changelog/51.0.0.md @@ -0,0 +1,74 @@ + + +# Apache DataFusion Python 51.0.0 Changelog + +This release consists of 23 commits from 7 contributors. See credits at the end of this changelog for more information. + +**Breaking changes:** + +- feat: reduce duplicate fields on join [#1184](https://github.com/apache/datafusion-python/pull/1184) (timsaucer) + +**Implemented enhancements:** + +- feat: expose `select_exprs` method on DataFrame [#1271](https://github.com/apache/datafusion-python/pull/1271) (milenkovicm) +- feat: allow DataFrame.filter to accept SQL strings [#1276](https://github.com/apache/datafusion-python/pull/1276) (K-dash) +- feat: add temporary view option for into_view [#1267](https://github.com/apache/datafusion-python/pull/1267) (timsaucer) +- feat: support session token parameter for AmazonS3 [#1275](https://github.com/apache/datafusion-python/pull/1275) (GCHQDeveloper028) +- feat: `with_column` supports SQL expression [#1284](https://github.com/apache/datafusion-python/pull/1284) (milenkovicm) +- feat: Add SQL expression for `repartition_by_hash` [#1285](https://github.com/apache/datafusion-python/pull/1285) (milenkovicm) +- feat: Add SQL expression support for `with_columns` [#1286](https://github.com/apache/datafusion-python/pull/1286) (milenkovicm) + +**Fixed bugs:** + +- fix: use coalesce instead of drop_duplicate_keys for join [#1318](https://github.com/apache/datafusion-python/pull/1318) (mesejo) +- fix: Inconsistent schemas when converting to pyarrow [#1315](https://github.com/apache/datafusion-python/pull/1315) (nuno-faria) + +**Other:** + +- Release 50.1 [#1281](https://github.com/apache/datafusion-python/pull/1281) (timsaucer) +- Update python minimum version to 3.10 [#1296](https://github.com/apache/datafusion-python/pull/1296) (timsaucer) +- chore: update datafusion minor version 
[#1297](https://github.com/apache/datafusion-python/pull/1297) (timsaucer) +- Enable remaining pylints [#1298](https://github.com/apache/datafusion-python/pull/1298) (timsaucer) +- Add Arrow C streaming, DataFrame iteration, and OOM-safe streaming execution [#1222](https://github.com/apache/datafusion-python/pull/1222) (kosiew) +- Add PyCapsule Type Support and Type Hint Enhancements for AggregateUDF in DataFusion Python Bindings [#1277](https://github.com/apache/datafusion-python/pull/1277) (kosiew) +- Add collect_column to dataframe [#1302](https://github.com/apache/datafusion-python/pull/1302) (timsaucer) +- chore: apply cargo fmt with import organization [#1303](https://github.com/apache/datafusion-python/pull/1303) (timsaucer) +- Feat/parameterized sql queries [#964](https://github.com/apache/datafusion-python/pull/964) (timsaucer) +- Upgrade to Datafusion 51 [#1311](https://github.com/apache/datafusion-python/pull/1311) (nuno-faria) +- minor: resolve build errors after latest merge into main [#1325](https://github.com/apache/datafusion-python/pull/1325) (timsaucer) +- Update build workflow link [#1330](https://github.com/apache/datafusion-python/pull/1330) (timsaucer) +- Do not convert pyarrow scalar values to plain python types when passing as `lit` [#1319](https://github.com/apache/datafusion-python/pull/1319) (timsaucer) + +## Credits + +Thank you to everyone who contributed to this release. Here is a breakdown of commits (PRs merged) per contributor. + +``` + 12 Tim Saucer + 4 Marko Milenković + 2 Nuno Faria + 2 kosiew + 1 Daniel Mesejo + 1 GCHQDeveloper028 + 1 𝕂 +``` + +Thank you also to everyone who contributed in other ways such as filing issues, reviewing PRs, and providing feedback on this release. 
+ diff --git a/dev/changelog/52.0.0.md b/dev/changelog/52.0.0.md new file mode 100644 index 000000000..3f848bb47 --- /dev/null +++ b/dev/changelog/52.0.0.md @@ -0,0 +1,78 @@ + + +# Apache DataFusion Python 52.0.0 Changelog + +This release consists of 26 commits from 9 contributors. See credits at the end of this changelog for more information. + +**Implemented enhancements:** + +- feat: add CatalogProviderList support [#1363](https://github.com/apache/datafusion-python/pull/1363) (timsaucer) +- feat: add support for generating JSON formatted substrait plan [#1376](https://github.com/apache/datafusion-python/pull/1376) (Prathamesh9284) +- feat: add regexp_instr function [#1382](https://github.com/apache/datafusion-python/pull/1382) (mesejo) + +**Fixed bugs:** + +- fix: mangled errors [#1377](https://github.com/apache/datafusion-python/pull/1377) (mesejo) + +**Documentation updates:** + +- docs: Clarify first_value usage in select vs aggregate [#1348](https://github.com/apache/datafusion-python/pull/1348) (AdMub) + +**Other:** + +- Release 51.0.0 [#1333](https://github.com/apache/datafusion-python/pull/1333) (timsaucer) +- Use explicit timer in unit test [#1338](https://github.com/apache/datafusion-python/pull/1338) (timsaucer) +- Add use_fabric_endpoint parameter to MicrosoftAzure class [#1357](https://github.com/apache/datafusion-python/pull/1357) (djouallah) +- Prepare for DF52 release [#1337](https://github.com/apache/datafusion-python/pull/1337) (timsaucer) +- build(deps): bump actions/checkout from 5 to 6 [#1310](https://github.com/apache/datafusion-python/pull/1310) (dependabot[bot]) +- build(deps): bump actions/download-artifact from 5 to 7 [#1321](https://github.com/apache/datafusion-python/pull/1321) (dependabot[bot]) +- build(deps): bump actions/upload-artifact from 4 to 6 [#1322](https://github.com/apache/datafusion-python/pull/1322) (dependabot[bot]) +- build(deps): bump actions/cache from 4 to 5 
[#1323](https://github.com/apache/datafusion-python/pull/1323) (dependabot[bot]) +- Pass Field information back and forth when using scalar UDFs [#1299](https://github.com/apache/datafusion-python/pull/1299) (timsaucer) +- Update dependency minor versions to prepare for DF52 release [#1368](https://github.com/apache/datafusion-python/pull/1368) (timsaucer) +- Improve displayed error by using `DataFusionError`'s `Display` trait [#1370](https://github.com/apache/datafusion-python/pull/1370) (abey79) +- Enforce DataFrame display memory limits with `max_rows` + `min_rows` constraint (deprecate `repr_rows`) [#1367](https://github.com/apache/datafusion-python/pull/1367) (kosiew) +- Implement all CSV reader options [#1361](https://github.com/apache/datafusion-python/pull/1361) (timsaucer) +- chore: add confirmation before tarball is released [#1372](https://github.com/apache/datafusion-python/pull/1372) (milenkovicm) +- Build in debug mode for PRs [#1375](https://github.com/apache/datafusion-python/pull/1375) (timsaucer) +- minor: remove ffi test wheel from distribution artifact [#1378](https://github.com/apache/datafusion-python/pull/1378) (timsaucer) +- chore: update rust 2024 edition [#1371](https://github.com/apache/datafusion-python/pull/1371) (timsaucer) +- Fix Python UDAF list-of-timestamps return by enforcing list-valued scalars and caching PyArrow types [#1347](https://github.com/apache/datafusion-python/pull/1347) (kosiew) +- minor: update cargo dependencies [#1383](https://github.com/apache/datafusion-python/pull/1383) (timsaucer) +- chore: bump Python version for RAT checking [#1386](https://github.com/apache/datafusion-python/pull/1386) (timsaucer) + +## Credits + +Thank you to everyone who contributed to this release. Here is a breakdown of commits (PRs merged) per contributor. 
+ +``` + 13 Tim Saucer + 4 dependabot[bot] + 2 Daniel Mesejo + 2 kosiew + 1 Adisa Mubarak (AdMub) + 1 Antoine Beyeler + 1 Dhanashri Prathamesh Iranna + 1 Marko Milenković + 1 Mimoune +``` + +Thank you also to everyone who contributed in other ways such as filing issues, reviewing PRs, and providing feedback on this release. + diff --git a/dev/changelog/pre-43.0.0.md b/dev/changelog/pre-43.0.0.md new file mode 100644 index 000000000..ae3a2348a --- /dev/null +++ b/dev/changelog/pre-43.0.0.md @@ -0,0 +1,715 @@ + + +# DataFusion Python Changelog + +## [42.0.0](https://github.com/apache/datafusion-python/tree/42.0.0) (2024-10-06) + +This release consists of 20 commits from 6 contributors. See credits at the end of this changelog for more information. + +**Implemented enhancements:** + +- feat: expose between [#868](https://github.com/apache/datafusion-python/pull/868) (mesejo) +- feat: make register_csv accept a list of paths [#883](https://github.com/apache/datafusion-python/pull/883) (mesejo) +- feat: expose http object store [#885](https://github.com/apache/datafusion-python/pull/885) (mesejo) + +**Fixed bugs:** + +- fix: Calling `count` on a pyarrow dataset results in an error [#843](https://github.com/apache/datafusion-python/pull/843) (Michael-J-Ward) + +**Other:** + +- Upgrade datafusion [#867](https://github.com/apache/datafusion-python/pull/867) (emgeee) +- Feature/aggregates as windows [#871](https://github.com/apache/datafusion-python/pull/871) (timsaucer) +- Fix regression on register_udaf [#878](https://github.com/apache/datafusion-python/pull/878) (timsaucer) +- build(deps): upgrade setup-protoc action and protoc version number [#873](https://github.com/apache/datafusion-python/pull/873) (Michael-J-Ward) +- build(deps): bump prost-types from 0.13.2 to 0.13.3 [#881](https://github.com/apache/datafusion-python/pull/881) (dependabot[bot]) +- build(deps): bump prost from 0.13.2 to 0.13.3 [#882](https://github.com/apache/datafusion-python/pull/882) 
(dependabot[bot]) +- chore: remove XFAIL from passing tests [#884](https://github.com/apache/datafusion-python/pull/884) (Michael-J-Ward) +- Add user defined window function support [#880](https://github.com/apache/datafusion-python/pull/880) (timsaucer) +- build(deps): bump syn from 2.0.77 to 2.0.79 [#886](https://github.com/apache/datafusion-python/pull/886) (dependabot[bot]) +- fix example of reading parquet from s3 [#896](https://github.com/apache/datafusion-python/pull/896) (sir-sigurd) +- release-testing [#889](https://github.com/apache/datafusion-python/pull/889) (Michael-J-Ward) +- chore(bench): fix create_tables.sql for tpch benchmark [#897](https://github.com/apache/datafusion-python/pull/897) (Michael-J-Ward) +- Add physical and logical plan conversion to and from protobuf [#892](https://github.com/apache/datafusion-python/pull/892) (timsaucer) +- Feature/instance udfs [#890](https://github.com/apache/datafusion-python/pull/890) (timsaucer) +- chore(ci): remove Mambaforge variant from CI [#894](https://github.com/apache/datafusion-python/pull/894) (Michael-J-Ward) +- Use OnceLock to store TokioRuntime [#895](https://github.com/apache/datafusion-python/pull/895) (Michael-J-Ward) + +## Credits + +Thank you to everyone who contributed to this release. Here is a breakdown of commits (PRs merged) per contributor. + +``` + 7 Michael J Ward + 5 Tim Saucer + 3 Daniel Mesejo + 3 dependabot[bot] + 1 Matt Green + 1 Sergey Fedoseev +``` + +Thank you also to everyone who contributed in other ways such as filing issues, reviewing PRs, and providing feedback on this release. + +## [41.0.0](https://github.com/apache/datafusion-python/tree/41.0.0) (2024-09-09) + +This release consists of 19 commits from 6 contributors. See credits at the end of this changelog for more information. 
+ +**Implemented enhancements:** + +- feat: enable list of paths for read_csv [#824](https://github.com/apache/datafusion-python/pull/824) (mesejo) +- feat: better exception and message for table not found [#851](https://github.com/apache/datafusion-python/pull/851) (mesejo) +- feat: make cast accept built-in Python types [#858](https://github.com/apache/datafusion-python/pull/858) (mesejo) + +**Other:** + +- chore: Prepare for 40.0.0 release [#801](https://github.com/apache/datafusion-python/pull/801) (andygrove) +- Add typing-extensions dependency to pyproject [#805](https://github.com/apache/datafusion-python/pull/805) (timsaucer) +- Upgrade deps to datafusion 41 [#802](https://github.com/apache/datafusion-python/pull/802) (Michael-J-Ward) +- Fix SessionContext init with only SessionConfig [#827](https://github.com/apache/datafusion-python/pull/827) (jcrist) +- build(deps): upgrade actions/{upload,download}-artifact@v3 to v4 [#829](https://github.com/apache/datafusion-python/pull/829) (Michael-J-Ward) +- Run ruff format in CI [#837](https://github.com/apache/datafusion-python/pull/837) (timsaucer) +- Add PyCapsule support for Arrow import and export [#825](https://github.com/apache/datafusion-python/pull/825) (timsaucer) +- Feature/expose when function [#836](https://github.com/apache/datafusion-python/pull/836) (timsaucer) +- Add Window Functions for use with function builder [#808](https://github.com/apache/datafusion-python/pull/808) (timsaucer) +- chore: fix typos [#844](https://github.com/apache/datafusion-python/pull/844) (mesejo) +- build(ci): use proper mac runners [#841](https://github.com/apache/datafusion-python/pull/841) (Michael-J-Ward) +- Set of small features [#839](https://github.com/apache/datafusion-python/pull/839) (timsaucer) +- chore: fix docstrings, typos [#852](https://github.com/apache/datafusion-python/pull/852) (mesejo) +- chore: Use datafusion re-exported dependencies [#856](https://github.com/apache/datafusion-python/pull/856) 
(emgeee) +- add guidelines on separating python and rust code [#860](https://github.com/apache/datafusion-python/pull/860) (Michael-J-Ward) +- Update Aggregate functions to take builder parameters [#859](https://github.com/apache/datafusion-python/pull/859) (timsaucer) + +## Credits + +Thank you to everyone who contributed to this release. Here is a breakdown of commits (PRs merged) per contributor. + +``` + 7 Tim Saucer + 5 Daniel Mesejo + 4 Michael J Ward + 1 Andy Grove + 1 Jim Crist-Harif + 1 Matt Green +``` + +Thank you also to everyone who contributed in other ways such as filing issues, reviewing PRs, and providing feedback on this release. + +## [40.0.0](https://github.com/apache/datafusion-python/tree/40.0.0) (2024-08-09) + +This release consists of 18 commits from 4 contributors. See credits at the end of this changelog for more information. + +- Update changelog for 39.0.0 [#742](https://github.com/apache/datafusion-python/pull/742) (andygrove) +- build(deps): bump uuid from 1.8.0 to 1.9.1 [#744](https://github.com/apache/datafusion-python/pull/744) (dependabot[bot]) +- build(deps): bump mimalloc from 0.1.42 to 0.1.43 [#745](https://github.com/apache/datafusion-python/pull/745) (dependabot[bot]) +- build(deps): bump syn from 2.0.67 to 2.0.68 [#746](https://github.com/apache/datafusion-python/pull/746) (dependabot[bot]) +- Tsaucer/find window fn [#747](https://github.com/apache/datafusion-python/pull/747) (timsaucer) +- Python wrapper classes for all user interfaces [#750](https://github.com/apache/datafusion-python/pull/750) (timsaucer) +- Expose array sort [#764](https://github.com/apache/datafusion-python/pull/764) (timsaucer) +- Upgrade protobuf and remove GH Action googletest-installer [#773](https://github.com/apache/datafusion-python/pull/773) (Michael-J-Ward) +- Upgrade Datafusion 40 [#771](https://github.com/apache/datafusion-python/pull/771) (Michael-J-Ward) +- Bugfix: Calling count with None arguments 
[#768](https://github.com/apache/datafusion-python/pull/768) (timsaucer) +- Add in user example that compares two different approaches to UDFs [#770](https://github.com/apache/datafusion-python/pull/770) (timsaucer) +- Add missing exports for wrapper modules [#782](https://github.com/apache/datafusion-python/pull/782) (timsaucer) +- Add PyExpr to_variant conversions [#793](https://github.com/apache/datafusion-python/pull/793) (Michael-J-Ward) +- Add missing expressions to wrapper export [#795](https://github.com/apache/datafusion-python/pull/795) (timsaucer) +- Doc/cross reference [#791](https://github.com/apache/datafusion-python/pull/791) (timsaucer) +- Re-Enable `num_centroids` to `approx_percentile_cont` [#798](https://github.com/apache/datafusion-python/pull/798) (Michael-J-Ward) +- UDAF process all state variables [#799](https://github.com/apache/datafusion-python/pull/799) (timsaucer) + +## Credits + +Thank you to everyone who contributed to this release. Here is a breakdown of commits (PRs merged) per contributor. + +``` + 9 Tim Saucer + 4 Michael J Ward + 3 dependabot[bot] + 2 Andy Grove +``` + +Thank you also to everyone who contributed in other ways such as filing issues, reviewing PRs, and providing feedback on this release. 
+ +## [39.0.0](https://github.com/apache/datafusion-python/tree/39.0.0) (2024-06-25) + +**Merged pull requests:** + +- ci: add substrait feature to linux builds [#720](https://github.com/apache/datafusion-python/pull/720) (Michael-J-Ward) +- Docs deploy action [#721](https://github.com/apache/datafusion-python/pull/721) (Michael-J-Ward) +- update deps [#723](https://github.com/apache/datafusion-python/pull/723) (Michael-J-Ward) +- Upgrade maturin [#725](https://github.com/apache/datafusion-python/pull/725) (Michael-J-Ward) +- Upgrade datafusion 39 [#728](https://github.com/apache/datafusion-python/pull/728) (Michael-J-Ward) +- use ScalarValue::to_pyarrow to convert to python object [#731](https://github.com/apache/datafusion-python/pull/731) (Michael-J-Ward) +- Pyo3 `Bound<'py, T>` api [#734](https://github.com/apache/datafusion-python/pull/734) (Michael-J-Ward) +- github test action: drop python 3.7, add python 3.12 [#736](https://github.com/apache/datafusion-python/pull/736) (Michael-J-Ward) +- Pyarrow filter pushdowns [#735](https://github.com/apache/datafusion-python/pull/735) (Michael-J-Ward) +- build(deps): bump syn from 2.0.66 to 2.0.67 [#738](https://github.com/apache/datafusion-python/pull/738) (dependabot[bot]) +- Pyo3 refactorings [#740](https://github.com/apache/datafusion-python/pull/740) (Michael-J-Ward) +- UDAF `sum` workaround [#741](https://github.com/apache/datafusion-python/pull/741) (Michael-J-Ward) + +## [38.0.1](https://github.com/apache/datafusion-python/tree/38.0.1) (2024-05-25) + +**Implemented enhancements:** + +- feat: add python bindings for ends_with function [#693](https://github.com/apache/datafusion-python/pull/693) (richtia) +- feat: expose `named_struct` in python [#700](https://github.com/apache/datafusion-python/pull/700) (Michael-J-Ward) + +**Merged pull requests:** + +- Add document about basics of working with expressions [#668](https://github.com/apache/datafusion-python/pull/668) (timsaucer) +- chore: Update Python release 
process now that DataFusion is TLP [#674](https://github.com/apache/datafusion-python/pull/674) (andygrove) +- Fix Docs [#676](https://github.com/apache/datafusion-python/pull/676) (Michael-J-Ward) +- Add examples from TPC-H [#666](https://github.com/apache/datafusion-python/pull/666) (timsaucer) +- fix conda nightly builds, attempt 2 [#689](https://github.com/apache/datafusion-python/pull/689) (Michael-J-Ward) +- Upgrade to datafusion 38 [#691](https://github.com/apache/datafusion-python/pull/691) (Michael-J-Ward) +- chore: update to maturin's recommended project layout for rust/python… [#695](https://github.com/apache/datafusion-python/pull/695) (Michael-J-Ward) +- chore: update cargo deps [#698](https://github.com/apache/datafusion-python/pull/698) (Michael-J-Ward) +- feat: add python bindings for ends_with function [#693](https://github.com/apache/datafusion-python/pull/693) (richtia) +- feat: expose `named_struct` in python [#700](https://github.com/apache/datafusion-python/pull/700) (Michael-J-Ward) +- Website fixes [#702](https://github.com/apache/datafusion-python/pull/702) (Michael-J-Ward) + +## [37.1.0](https://github.com/apache/datafusion-python/tree/37.1.0) (2024-05-08) + +**Implemented enhancements:** + +- feat: add execute_stream and execute_stream_partitioned [#610](https://github.com/apache/datafusion-python/pull/610) (mesejo) + +**Documentation updates:** + +- docs: update docs CI to install python-311 requirements [#661](https://github.com/apache/datafusion-python/pull/661) (Michael-J-Ward) + +**Merged pull requests:** + +- Switch to Ruff for Python linting [#529](https://github.com/apache/datafusion-python/pull/529) (andygrove) +- Remove sql-on-pandas/polars/cudf examples [#602](https://github.com/apache/datafusion-python/pull/602) (andygrove) +- build(deps): bump object_store from 0.9.0 to 0.9.1 [#611](https://github.com/apache/datafusion-python/pull/611) (dependabot[bot]) +- More missing array funcs 
[#605](https://github.com/apache/datafusion-python/pull/605) (judahrand) +- feat: add execute_stream and execute_stream_partitioned [#610](https://github.com/apache/datafusion-python/pull/610) (mesejo) +- build(deps): bump uuid from 1.7.0 to 1.8.0 [#615](https://github.com/apache/datafusion-python/pull/615) (dependabot[bot]) +- Bind SQLOptions and relative ctx method #567 [#588](https://github.com/apache/datafusion-python/pull/588) (giacomorebecchi) +- bugfix: no panic on empty table [#613](https://github.com/apache/datafusion-python/pull/613) (mesejo) +- Expose `register_listing_table` [#618](https://github.com/apache/datafusion-python/pull/618) (henrifroese) +- Expose unnest feature [#641](https://github.com/apache/datafusion-python/pull/641) (timsaucer) +- Update domain names and paths in asf yaml [#643](https://github.com/apache/datafusion-python/pull/643) (andygrove) +- use python 3.11 to publish docs [#645](https://github.com/apache/datafusion-python/pull/645) (andygrove) +- docs: update docs CI to install python-311 requirements [#661](https://github.com/apache/datafusion-python/pull/661) (Michael-J-Ward) +- Upgrade Datafusion to v37.1.0 [#669](https://github.com/apache/datafusion-python/pull/669) (Michael-J-Ward) + +## [36.0.0](https://github.com/apache/datafusion-python/tree/36.0.0) (2024-03-02) + +**Implemented enhancements:** + +- feat: Add `flatten` array function [#562](https://github.com/apache/datafusion-python/pull/562) (mobley-trent) + +**Documentation updates:** + +- docs: Add ASF attribution [#580](https://github.com/apache/datafusion-python/pull/580) (simicd) + +**Merged pull requests:** + +- Allow PyDataFrame to be used from other projects [#582](https://github.com/apache/datafusion-python/pull/582) (andygrove) +- docs: Add ASF attribution [#580](https://github.com/apache/datafusion-python/pull/580) (simicd) +- Add array functions [#560](https://github.com/apache/datafusion-python/pull/560) (ongchi) +- feat: Add `flatten` array function 
[#562](https://github.com/apache/datafusion-python/pull/562) (mobley-trent) + +## [35.0.0](https://github.com/apache/datafusion-python/tree/35.0.0) (2024-01-20) + +**Merged pull requests:** + +- build(deps): bump syn from 2.0.41 to 2.0.43 [#559](https://github.com/apache/datafusion-python/pull/559) (dependabot[bot]) +- build(deps): bump tokio from 1.35.0 to 1.35.1 [#558](https://github.com/apache/datafusion-python/pull/558) (dependabot[bot]) +- build(deps): bump async-trait from 0.1.74 to 0.1.77 [#556](https://github.com/apache/datafusion-python/pull/556) (dependabot[bot]) +- build(deps): bump pyo3 from 0.20.0 to 0.20.2 [#557](https://github.com/apache/datafusion-python/pull/557) (dependabot[bot]) + +## [34.0.0](https://github.com/apache/datafusion-python/tree/34.0.0) (2023-12-28) + +**Merged pull requests:** + +- Adjust visibility of crate private members & Functions [#537](https://github.com/apache/datafusion-python/pull/537) (jdye64) +- Update json.rst [#538](https://github.com/apache/datafusion-python/pull/538) (ray-andrew) +- Enable mimalloc local_dynamic_tls feature [#540](https://github.com/apache/datafusion-python/pull/540) (jdye64) +- Enable substrait feature to be built by default in CI, for nightlies … [#544](https://github.com/apache/datafusion-python/pull/544) (jdye64) + +## [33.0.0](https://github.com/apache/datafusion-python/tree/33.0.0) (2023-11-16) + +**Merged pull requests:** + +- First pass at getting architectured builds working [#350](https://github.com/apache/datafusion-python/pull/350) (charlesbluca) +- Remove libprotobuf dep [#527](https://github.com/apache/datafusion-python/pull/527) (jdye64) + +## [32.0.0](https://github.com/apache/datafusion-python/tree/32.0.0) (2023-10-21) + +**Implemented enhancements:** + +- feat: expose PyWindowFrame [#509](https://github.com/apache/datafusion-python/pull/509) (dlovell) +- add Binary String Functions;encode,decode [#494](https://github.com/apache/datafusion-python/pull/494) (jiangzhx) +- add 
bit_and,bit_or,bit_xor,bool_add,bool_or [#496](https://github.com/apache/datafusion-python/pull/496) (jiangzhx) +- add first_value last_value [#498](https://github.com/apache/datafusion-python/pull/498) (jiangzhx) +- add regr\_\* functions [#499](https://github.com/apache/datafusion-python/pull/499) (jiangzhx) +- Add random missing bindings [#522](https://github.com/apache/datafusion-python/pull/522) (jdye64) +- Allow for multiple input files per table instead of a single file [#519](https://github.com/apache/datafusion-python/pull/519) (jdye64) +- Add support for window function bindings [#521](https://github.com/apache/datafusion-python/pull/521) (jdye64) + +**Merged pull requests:** + +- Prepare 31.0.0 release [#500](https://github.com/apache/datafusion-python/pull/500) (andygrove) +- Improve release process documentation [#505](https://github.com/apache/datafusion-python/pull/505) (andygrove) +- add Binary String Functions;encode,decode [#494](https://github.com/apache/datafusion-python/pull/494) (jiangzhx) +- build(deps): bump mimalloc from 0.1.38 to 0.1.39 [#502](https://github.com/apache/datafusion-python/pull/502) (dependabot[bot]) +- build(deps): bump syn from 2.0.32 to 2.0.35 [#503](https://github.com/apache/datafusion-python/pull/503) (dependabot[bot]) +- build(deps): bump syn from 2.0.35 to 2.0.37 [#506](https://github.com/apache/datafusion-python/pull/506) (dependabot[bot]) +- Use latest DataFusion [#511](https://github.com/apache/datafusion-python/pull/511) (andygrove) +- add bit_and,bit_or,bit_xor,bool_add,bool_or [#496](https://github.com/apache/datafusion-python/pull/496) (jiangzhx) +- use DataFusion 32 [#515](https://github.com/apache/datafusion-python/pull/515) (andygrove) +- add first_value last_value [#498](https://github.com/apache/datafusion-python/pull/498) (jiangzhx) +- build(deps): bump regex-syntax from 0.7.5 to 0.8.1 [#517](https://github.com/apache/datafusion-python/pull/517) (dependabot[bot]) +- build(deps): bump pyo3-build-config from 
0.19.2 to 0.20.0 [#516](https://github.com/apache/datafusion-python/pull/516) (dependabot[bot]) +- add regr\_\* functions [#499](https://github.com/apache/datafusion-python/pull/499) (jiangzhx) +- Add random missing bindings [#522](https://github.com/apache/datafusion-python/pull/522) (jdye64) +- build(deps): bump rustix from 0.38.18 to 0.38.19 [#523](https://github.com/apache/datafusion-python/pull/523) (dependabot[bot]) +- Allow for multiple input files per table instead of a single file [#519](https://github.com/apache/datafusion-python/pull/519) (jdye64) +- Add support for window function bindings [#521](https://github.com/apache/datafusion-python/pull/521) (jdye64) +- Small clippy fix [#524](https://github.com/apache/datafusion-python/pull/524) (andygrove) + +## [31.0.0](https://github.com/apache/datafusion-python/tree/31.0.0) (2023-09-12) + +[Full Changelog](https://github.com/apache/datafusion-python/compare/28.0.0...31.0.0) + +**Implemented enhancements:** + +- feat: add case function (#447) [#448](https://github.com/apache/datafusion-python/pull/448) (mesejo) +- feat: add compression options [#456](https://github.com/apache/datafusion-python/pull/456) (mesejo) +- feat: add register_json [#458](https://github.com/apache/datafusion-python/pull/458) (mesejo) +- feat: add basic compression configuration to write_parquet [#459](https://github.com/apache/datafusion-python/pull/459) (mesejo) +- feat: add example of reading parquet from s3 [#460](https://github.com/apache/datafusion-python/pull/460) (mesejo) +- feat: add register_avro and read_table [#461](https://github.com/apache/datafusion-python/pull/461) (mesejo) +- feat: add missing scalar math functions [#465](https://github.com/apache/datafusion-python/pull/465) (mesejo) + +**Documentation updates:** + +- docs: include pre-commit hooks section in contributor guide [#455](https://github.com/apache/datafusion-python/pull/455) (mesejo) + +**Merged pull requests:** + +- Build Linux aarch64 wheel 
[#443](https://github.com/apache/datafusion-python/pull/443) (gokselk) +- feat: add case function (#447) [#448](https://github.com/apache/datafusion-python/pull/448) (mesejo) +- enhancement(docs): Add user guide (#432) [#445](https://github.com/apache/datafusion-python/pull/445) (mesejo) +- docs: include pre-commit hooks section in contributor guide [#455](https://github.com/apache/datafusion-python/pull/455) (mesejo) +- feat: add compression options [#456](https://github.com/apache/datafusion-python/pull/456) (mesejo) +- Upgrade to DF 28.0.0-rc1 [#457](https://github.com/apache/datafusion-python/pull/457) (andygrove) +- feat: add register_json [#458](https://github.com/apache/datafusion-python/pull/458) (mesejo) +- feat: add basic compression configuration to write_parquet [#459](https://github.com/apache/datafusion-python/pull/459) (mesejo) +- feat: add example of reading parquet from s3 [#460](https://github.com/apache/datafusion-python/pull/460) (mesejo) +- feat: add register_avro and read_table [#461](https://github.com/apache/datafusion-python/pull/461) (mesejo) +- feat: add missing scalar math functions [#465](https://github.com/apache/datafusion-python/pull/465) (mesejo) +- build(deps): bump arduino/setup-protoc from 1 to 2 [#452](https://github.com/apache/datafusion-python/pull/452) (dependabot[bot]) +- Revert "build(deps): bump arduino/setup-protoc from 1 to 2 (#452)" [#474](https://github.com/apache/datafusion-python/pull/474) (viirya) +- Minor: fix wrongly copied function description [#497](https://github.com/apache/datafusion-python/pull/497) (viirya) +- Upgrade to Datafusion 31.0.0 [#491](https://github.com/apache/datafusion-python/pull/491) (judahrand) +- Add `isnan` and `iszero` [#495](https://github.com/apache/datafusion-python/pull/495) (judahrand) + +## 30.0.0 + +- Skipped due to a breaking change in DataFusion + +## 29.0.0 + +- Skipped + +## [28.0.0](https://github.com/apache/datafusion-python/tree/28.0.0) (2023-07-25) + +**Implemented 
enhancements:** + +- feat: expose offset in python API [#437](https://github.com/apache/datafusion-python/pull/437) (cpcloud) + +**Merged pull requests:** + +- File based input utils [#433](https://github.com/apache/datafusion-python/pull/433) (jdye64) +- Upgrade to 28.0.0-rc1 [#434](https://github.com/apache/datafusion-python/pull/434) (andygrove) +- Introduces utility for obtaining SqlTable information from a file like location [#398](https://github.com/apache/datafusion-python/pull/398) (jdye64) +- feat: expose offset in python API [#437](https://github.com/apache/datafusion-python/pull/437) (cpcloud) +- Use DataFusion 28 [#439](https://github.com/apache/datafusion-python/pull/439) (andygrove) + +## [27.0.0](https://github.com/apache/datafusion-python/tree/27.0.0) (2023-07-03) + +**Merged pull requests:** + +- LogicalPlan.to_variant() make public [#412](https://github.com/apache/datafusion-python/pull/412) (jdye64) +- Prepare 27.0.0 release [#423](https://github.com/apache/datafusion-python/pull/423) (andygrove) + +## [26.0.0](https://github.com/apache/datafusion-python/tree/26.0.0) (2023-06-11) + +[Full Changelog](https://github.com/apache/datafusion-python/compare/25.0.0...26.0.0) + +**Merged pull requests:** + +- Add Expr::Case when_then_else support to rex_call_operands function [#388](https://github.com/apache/datafusion-python/pull/388) (jdye64) +- Introduce BaseSessionContext abstract class [#390](https://github.com/apache/datafusion-python/pull/390) (jdye64) +- CRUD Schema support for `BaseSessionContext` [#392](https://github.com/apache/datafusion-python/pull/392) (jdye64) +- CRUD Table support for `BaseSessionContext` [#394](https://github.com/apache/datafusion-python/pull/394) (jdye64) + +## [25.0.0](https://github.com/apache/datafusion-python/tree/25.0.0) (2023-05-23) + +[Full Changelog](https://github.com/apache/datafusion-python/compare/24.0.0...25.0.0) + +**Merged pull requests:** + +- Prepare 24.0.0 Release 
[#376](https://github.com/apache/datafusion-python/pull/376) (andygrove) +- build(deps): bump uuid from 1.3.1 to 1.3.2 [#359](https://github.com/apache/datafusion-python/pull/359) (dependabot[bot]) +- build(deps): bump mimalloc from 0.1.36 to 0.1.37 [#361](https://github.com/apache/datafusion-python/pull/361) (dependabot[bot]) +- build(deps): bump regex-syntax from 0.6.29 to 0.7.1 [#334](https://github.com/apache/datafusion-python/pull/334) (dependabot[bot]) +- upgrade maturin to 0.15.1 [#379](https://github.com/apache/datafusion-python/pull/379) (Jimexist) +- Expand Expr to include RexType basic support [#378](https://github.com/apache/datafusion-python/pull/378) (jdye64) +- Add Python script for generating changelog [#383](https://github.com/apache/datafusion-python/pull/383) (andygrove) + +## [24.0.0](https://github.com/apache/datafusion-python/tree/24.0.0) (2023-05-09) + +[Full Changelog](https://github.com/apache/datafusion-python/compare/23.0.0...24.0.0) + +**Documentation updates:** + +- Fix link to user guide [#354](https://github.com/apache/datafusion-python/pull/354) (andygrove) + +**Merged pull requests:** + +- Add interface to serialize Substrait plans to Python Bytes. [#344](https://github.com/apache/datafusion-python/pull/344) (kylebrooks-8451) +- Add partition_count property to ExecutionPlan. [#346](https://github.com/apache/datafusion-python/pull/346) (kylebrooks-8451) +- Remove unsendable from all Rust pyclass types. [#348](https://github.com/apache/datafusion-python/pull/348) (kylebrooks-8451) +- Fix link to user guide [#354](https://github.com/apache/datafusion-python/pull/354) (andygrove) +- Fix SessionContext execute. 
[#353](https://github.com/apache/datafusion-python/pull/353) (kylebrooks-8451) +- Pub mod expr in lib.rs [#357](https://github.com/apache/datafusion-python/pull/357) (jdye64) +- Add benchmark derived from TPC-H [#355](https://github.com/apache/datafusion-python/pull/355) (andygrove) +- Add db-benchmark [#365](https://github.com/apache/datafusion-python/pull/365) (andygrove) +- First pass of documentation in mdBook [#364](https://github.com/apache/datafusion-python/pull/364) (MrPowers) +- Add 'pub' and '#[pyo3(get, set)]' to DataTypeMap [#371](https://github.com/apache/datafusion-python/pull/371) (jdye64) +- Fix db-benchmark [#369](https://github.com/apache/datafusion-python/pull/369) (andygrove) +- Docs explaining how to view query plans [#373](https://github.com/apache/datafusion-python/pull/373) (andygrove) +- Improve db-benchmark [#372](https://github.com/apache/datafusion-python/pull/372) (andygrove) +- Make expr member of PyExpr public [#375](https://github.com/apache/datafusion-python/pull/375) (jdye64) + +## [23.0.0](https://github.com/apache/datafusion-python/tree/23.0.0) (2023-04-23) + +[Full Changelog](https://github.com/apache/datafusion-python/compare/22.0.0...23.0.0) + +**Merged pull requests:** + +- Improve API docs, README, and examples for configuring context [#321](https://github.com/apache/datafusion-python/pull/321) (andygrove) +- Osx build linker args [#330](https://github.com/apache/datafusion-python/pull/330) (jdye64) +- Add requirements file for python 3.11 [#332](https://github.com/apache/datafusion-python/pull/332) (r4ntix) +- mac arm64 build [#338](https://github.com/apache/datafusion-python/pull/338) (andygrove) +- Add conda.yaml baseline workflow file [#281](https://github.com/apache/datafusion-python/pull/281) (jdye64) +- Prepare for 23.0.0 release [#335](https://github.com/apache/datafusion-python/pull/335) (andygrove) +- Reuse the Tokio Runtime [#341](https://github.com/apache/datafusion-python/pull/341) (kylebrooks-8451) + +## 
[22.0.0](https://github.com/apache/datafusion-python/tree/22.0.0) (2023-04-10) + +[Full Changelog](https://github.com/apache/datafusion-python/compare/21.0.0...22.0.0) + +**Merged pull requests:** + +- Fix invalid build yaml [#308](https://github.com/apache/datafusion-python/pull/308) (andygrove) +- Try fix release build [#309](https://github.com/apache/datafusion-python/pull/309) (andygrove) +- Fix release build [#310](https://github.com/apache/datafusion-python/pull/310) (andygrove) +- Enable datafusion-substrait protoc feature, to remove compile-time dependency on protoc [#312](https://github.com/apache/datafusion-python/pull/312) (andygrove) +- Fix Mac/Win release builds in CI [#313](https://github.com/apache/datafusion-python/pull/313) (andygrove) +- install protoc in docs workflow [#314](https://github.com/apache/datafusion-python/pull/314) (andygrove) +- Fix documentation generation in CI [#315](https://github.com/apache/datafusion-python/pull/315) (andygrove) +- Source wheel fix [#319](https://github.com/apache/datafusion-python/pull/319) (andygrove) + +## [21.0.0](https://github.com/apache/datafusion-python/tree/21.0.0) (2023-03-30) + +[Full Changelog](https://github.com/apache/datafusion-python/compare/20.0.0...21.0.0) + +**Merged pull requests:** + +- minor: Fix minor warning on unused import [#289](https://github.com/apache/datafusion-python/pull/289) (viirya) +- feature: Implement `describe()` method [#293](https://github.com/apache/datafusion-python/pull/293) (simicd) +- fix: Printed results not visible in debugger & notebooks [#296](https://github.com/apache/datafusion-python/pull/296) (simicd) +- add package.include and remove wildcard dependency [#295](https://github.com/apache/datafusion-python/pull/295) (andygrove) +- Update main branch name in docs workflow [#303](https://github.com/apache/datafusion-python/pull/303) (andygrove) +- Upgrade to DF 21 [#301](https://github.com/apache/datafusion-python/pull/301) (andygrove) + +## 
[20.0.0](https://github.com/apache/datafusion-python/tree/20.0.0) (2023-03-17) + +[Full Changelog](https://github.com/apache/datafusion-python/compare/0.8.0...20.0.0) + +**Implemented enhancements:** + +- Empty relation bindings [#208](https://github.com/apache/datafusion-python/pull/208) (jdye64) +- wrap display_name and canonical_name functions [#214](https://github.com/apache/datafusion-python/pull/214) (jdye64) +- Add PyAlias bindings [#216](https://github.com/apache/datafusion-python/pull/216) (jdye64) +- Add bindings for scalar_variable [#218](https://github.com/apache/datafusion-python/pull/218) (jdye64) +- Bindings for LIKE type expressions [#220](https://github.com/apache/datafusion-python/pull/220) (jdye64) +- Bool expr bindings [#223](https://github.com/apache/datafusion-python/pull/223) (jdye64) +- Between bindings [#229](https://github.com/apache/datafusion-python/pull/229) (jdye64) +- Add bindings for GetIndexedField [#227](https://github.com/apache/datafusion-python/pull/227) (jdye64) +- Add bindings for case, cast, and trycast [#232](https://github.com/apache/datafusion-python/pull/232) (jdye64) +- add remaining expr bindings [#233](https://github.com/apache/datafusion-python/pull/233) (jdye64) +- feature: Additional export methods [#236](https://github.com/apache/datafusion-python/pull/236) (simicd) +- Add Python wrapper for LogicalPlan::Union [#240](https://github.com/apache/datafusion-python/pull/240) (iajoiner) +- feature: Create dataframe from pandas, polars, dictionary, list or pyarrow Table [#242](https://github.com/apache/datafusion-python/pull/242) (simicd) +- Add Python wrappers for `LogicalPlan::Join` and `LogicalPlan::CrossJoin` [#246](https://github.com/apache/datafusion-python/pull/246) (iajoiner) +- feature: Set table name from ctx functions [#260](https://github.com/apache/datafusion-python/pull/260) (simicd) +- Explain bindings [#264](https://github.com/apache/datafusion-python/pull/264) (jdye64) +- Extension bindings 
[#266](https://github.com/apache/datafusion-python/pull/266) (jdye64) +- Subquery alias bindings [#269](https://github.com/apache/datafusion-python/pull/269) (jdye64) +- Create memory table [#271](https://github.com/apache/datafusion-python/pull/271) (jdye64) +- Create view bindings [#273](https://github.com/apache/datafusion-python/pull/273) (jdye64) +- Re-export Datafusion dependencies [#277](https://github.com/apache/datafusion-python/pull/277) (jdye64) +- Distinct bindings [#275](https://github.com/apache/datafusion-python/pull/275) (jdye64) +- Drop table bindings [#283](https://github.com/apache/datafusion-python/pull/283) (jdye64) +- Bindings for LogicalPlan::Repartition [#285](https://github.com/apache/datafusion-python/pull/285) (jdye64) +- Expand Rust return type support for Arrow DataTypes in ScalarValue [#287](https://github.com/apache/datafusion-python/pull/287) (jdye64) + +**Documentation updates:** + +- docs: Example of calling Python UDF & UDAF in SQL [#258](https://github.com/apache/datafusion-python/pull/258) (simicd) + +**Merged pull requests:** + +- Minor docs updates [#210](https://github.com/apache/datafusion-python/pull/210) (andygrove) +- Empty relation bindings [#208](https://github.com/apache/datafusion-python/pull/208) (jdye64) +- wrap display_name and canonical_name functions [#214](https://github.com/apache/datafusion-python/pull/214) (jdye64) +- Add PyAlias bindings [#216](https://github.com/apache/datafusion-python/pull/216) (jdye64) +- Add bindings for scalar_variable [#218](https://github.com/apache/datafusion-python/pull/218) (jdye64) +- Bindings for LIKE type expressions [#220](https://github.com/apache/datafusion-python/pull/220) (jdye64) +- Bool expr bindings [#223](https://github.com/apache/datafusion-python/pull/223) (jdye64) +- Between bindings [#229](https://github.com/apache/datafusion-python/pull/229) (jdye64) +- Add bindings for GetIndexedField [#227](https://github.com/apache/datafusion-python/pull/227) (jdye64) +- Add 
bindings for case, cast, and trycast [#232](https://github.com/apache/datafusion-python/pull/232) (jdye64) +- add remaining expr bindings [#233](https://github.com/apache/datafusion-python/pull/233) (jdye64) +- Pre-commit hooks [#228](https://github.com/apache/datafusion-python/pull/228) (jdye64) +- Implement new release process [#149](https://github.com/apache/datafusion-python/pull/149) (andygrove) +- feature: Additional export methods [#236](https://github.com/apache/datafusion-python/pull/236) (simicd) +- Add Python wrapper for LogicalPlan::Union [#240](https://github.com/apache/datafusion-python/pull/240) (iajoiner) +- feature: Create dataframe from pandas, polars, dictionary, list or pyarrow Table [#242](https://github.com/apache/datafusion-python/pull/242) (simicd) +- Fix release instructions [#238](https://github.com/apache/datafusion-python/pull/238) (andygrove) +- Add Python wrappers for `LogicalPlan::Join` and `LogicalPlan::CrossJoin` [#246](https://github.com/apache/datafusion-python/pull/246) (iajoiner) +- docs: Example of calling Python UDF & UDAF in SQL [#258](https://github.com/apache/datafusion-python/pull/258) (simicd) +- feature: Set table name from ctx functions [#260](https://github.com/apache/datafusion-python/pull/260) (simicd) +- Upgrade to DataFusion 19 [#262](https://github.com/apache/datafusion-python/pull/262) (andygrove) +- Explain bindings [#264](https://github.com/apache/datafusion-python/pull/264) (jdye64) +- Extension bindings [#266](https://github.com/apache/datafusion-python/pull/266) (jdye64) +- Subquery alias bindings [#269](https://github.com/apache/datafusion-python/pull/269) (jdye64) +- Create memory table [#271](https://github.com/apache/datafusion-python/pull/271) (jdye64) +- Create view bindings [#273](https://github.com/apache/datafusion-python/pull/273) (jdye64) +- Re-export Datafusion dependencies [#277](https://github.com/apache/datafusion-python/pull/277) (jdye64) +- Distinct bindings 
[#275](https://github.com/apache/datafusion-python/pull/275) (jdye64) +- build(deps): bump actions/checkout from 2 to 3 [#244](https://github.com/apache/datafusion-python/pull/244) (dependabot[bot]) +- build(deps): bump actions/upload-artifact from 2 to 3 [#245](https://github.com/apache/datafusion-python/pull/245) (dependabot[bot]) +- build(deps): bump actions/download-artifact from 2 to 3 [#243](https://github.com/apache/datafusion-python/pull/243) (dependabot[bot]) +- Use DataFusion 20 [#278](https://github.com/apache/datafusion-python/pull/278) (andygrove) +- Drop table bindings [#283](https://github.com/apache/datafusion-python/pull/283) (jdye64) +- Bindings for LogicalPlan::Repartition [#285](https://github.com/apache/datafusion-python/pull/285) (jdye64) +- Expand Rust return type support for Arrow DataTypes in ScalarValue [#287](https://github.com/apache/datafusion-python/pull/287) (jdye64) + +## [0.8.0](https://github.com/apache/datafusion-python/tree/0.8.0) (2023-02-22) + +[Full Changelog](https://github.com/apache/datafusion-python/compare/0.8.0-rc1...0.8.0) + +**Implemented enhancements:** + +- Add support for cuDF physical execution engine [\#202](https://github.com/apache/datafusion-python/issues/202) +- Make it easier to create a Pandas dataframe from DataFusion query results [\#139](https://github.com/apache/datafusion-python/issues/139) + +**Fixed bugs:** + +- Build error: could not compile `thiserror` due to 2 previous errors [\#69](https://github.com/apache/datafusion-python/issues/69) + +**Closed issues:** + +- Integrate with the new `object_store` crate [\#22](https://github.com/apache/datafusion-python/issues/22) + +**Merged pull requests:** + +- Update README in preparation for 0.8 release [\#206](https://github.com/apache/datafusion-python/pull/206) ([andygrove](https://github.com/andygrove)) +- Add support for cudf as a physical execution engine [\#205](https://github.com/apache/datafusion-python/pull/205) 
([jdye64](https://github.com/jdye64)) +- Run `maturin develop` instead of `cargo build` in verification script [\#200](https://github.com/apache/datafusion-python/pull/200) ([andygrove](https://github.com/andygrove)) +- Add tests for recently added functionality [\#199](https://github.com/apache/datafusion-python/pull/199) ([andygrove](https://github.com/andygrove)) +- Implement `to_pandas()` [\#197](https://github.com/apache/datafusion-python/pull/197) ([simicd](https://github.com/simicd)) +- Add Python wrapper for LogicalPlan::Sort [\#196](https://github.com/apache/datafusion-python/pull/196) ([andygrove](https://github.com/andygrove)) +- Add Python wrapper for LogicalPlan::Aggregate [\#195](https://github.com/apache/datafusion-python/pull/195) ([andygrove](https://github.com/andygrove)) +- Add Python wrapper for LogicalPlan::Limit [\#193](https://github.com/apache/datafusion-python/pull/193) ([andygrove](https://github.com/andygrove)) +- Add Python wrapper for LogicalPlan::Filter [\#192](https://github.com/apache/datafusion-python/pull/192) ([andygrove](https://github.com/andygrove)) +- Add experimental support for executing SQL with Polars and Pandas [\#190](https://github.com/apache/datafusion-python/pull/190) ([andygrove](https://github.com/andygrove)) +- Update changelog for 0.8 release [\#188](https://github.com/apache/datafusion-python/pull/188) ([andygrove](https://github.com/andygrove)) +- Add ability to execute ExecutionPlan and get a stream of RecordBatch [\#186](https://github.com/apache/datafusion-python/pull/186) ([andygrove](https://github.com/andygrove)) +- Dffield bindings [\#185](https://github.com/apache/datafusion-python/pull/185) ([jdye64](https://github.com/jdye64)) +- Add bindings for DFSchema [\#183](https://github.com/apache/datafusion-python/pull/183) ([jdye64](https://github.com/jdye64)) +- test: Window functions [\#182](https://github.com/apache/datafusion-python/pull/182) ([simicd](https://github.com/simicd)) +- Add bindings for 
Projection [\#180](https://github.com/apache/datafusion-python/pull/180) ([jdye64](https://github.com/jdye64)) +- Table scan bindings [\#178](https://github.com/apache/datafusion-python/pull/178) ([jdye64](https://github.com/jdye64)) +- Make session configurable [\#176](https://github.com/apache/datafusion-python/pull/176) ([andygrove](https://github.com/andygrove)) +- Upgrade to DataFusion 18.0.0 [\#175](https://github.com/apache/datafusion-python/pull/175) ([andygrove](https://github.com/andygrove)) +- Use latest DataFusion rev in preparation for DF 18 release [\#174](https://github.com/apache/datafusion-python/pull/174) ([andygrove](https://github.com/andygrove)) +- Arrow type bindings [\#173](https://github.com/apache/datafusion-python/pull/173) ([jdye64](https://github.com/jdye64)) +- Pyo3 bump [\#171](https://github.com/apache/datafusion-python/pull/171) ([jdye64](https://github.com/jdye64)) +- feature: Add additional aggregation functions [\#170](https://github.com/apache/datafusion-python/pull/170) ([simicd](https://github.com/simicd)) +- Make from_substrait_plan return DataFrame instead of LogicalPlan [\#164](https://github.com/apache/datafusion-python/pull/164) ([andygrove](https://github.com/andygrove)) +- feature: Implement count method [\#163](https://github.com/apache/datafusion-python/pull/163) ([simicd](https://github.com/simicd)) +- CI Fixes [\#162](https://github.com/apache/datafusion-python/pull/162) ([jdye64](https://github.com/jdye64)) +- Upgrade to DataFusion 17 [\#160](https://github.com/apache/datafusion-python/pull/160) ([andygrove](https://github.com/andygrove)) +- feature: Improve string representation of datafusion classes [\#159](https://github.com/apache/datafusion-python/pull/159) ([simicd](https://github.com/simicd)) +- Make PyExecutionPlan.plan public [\#156](https://github.com/apache/datafusion-python/pull/156) ([andygrove](https://github.com/andygrove)) +- Expose methods on logical and execution plans 
[\#155](https://github.com/apache/datafusion-python/pull/155) ([andygrove](https://github.com/andygrove)) +- Fix clippy for new Rust version [\#154](https://github.com/apache/datafusion-python/pull/154) ([andygrove](https://github.com/andygrove)) +- Add DataFrame methods for accessing plans [\#153](https://github.com/apache/datafusion-python/pull/153) ([andygrove](https://github.com/andygrove)) +- Use DataFusion rev 5238e8c97f998b4d2cb9fab85fb182f325a1a7fb [\#150](https://github.com/apache/datafusion-python/pull/150) ([andygrove](https://github.com/andygrove)) +- build\(deps\): bump async-trait from 0.1.61 to 0.1.62 [\#148](https://github.com/apache/datafusion-python/pull/148) ([dependabot[bot]](https://github.com/apps/dependabot)) +- Rename default branch from master to main [\#147](https://github.com/apache/datafusion-python/pull/147) ([andygrove](https://github.com/andygrove)) +- Substrait bindings [\#145](https://github.com/apache/datafusion-python/pull/145) ([jdye64](https://github.com/jdye64)) +- build\(deps\): bump uuid from 0.8.2 to 1.2.2 [\#143](https://github.com/apache/datafusion-python/pull/143) ([dependabot[bot]](https://github.com/apps/dependabot)) +- Prepare for 0.8.0 release [\#141](https://github.com/apache/datafusion-python/pull/141) ([andygrove](https://github.com/andygrove)) +- Improve README and add more examples [\#137](https://github.com/apache/datafusion-python/pull/137) ([andygrove](https://github.com/andygrove)) +- test: Expand tests for built-in functions [\#129](https://github.com/apache/datafusion-python/pull/129) ([simicd](https://github.com/simicd)) +- build\(deps\): bump object_store from 0.5.2 to 0.5.3 [\#126](https://github.com/apache/datafusion-python/pull/126) ([dependabot[bot]](https://github.com/apps/dependabot)) +- build\(deps\): bump mimalloc from 0.1.32 to 0.1.34 [\#125](https://github.com/apache/datafusion-python/pull/125) ([dependabot[bot]](https://github.com/apps/dependabot)) +- Introduce conda directory containing 
datafusion-dev.yaml conda enviro… [\#124](https://github.com/apache/datafusion-python/pull/124) ([jdye64](https://github.com/jdye64)) +- build\(deps\): bump bzip2 from 0.4.3 to 0.4.4 [\#121](https://github.com/apache/datafusion-python/pull/121) ([dependabot[bot]](https://github.com/apps/dependabot)) +- build\(deps\): bump tokio from 1.23.0 to 1.24.1 [\#119](https://github.com/apache/datafusion-python/pull/119) ([dependabot[bot]](https://github.com/apps/dependabot)) +- build\(deps\): bump async-trait from 0.1.60 to 0.1.61 [\#118](https://github.com/apache/datafusion-python/pull/118) ([dependabot[bot]](https://github.com/apps/dependabot)) +- Upgrade to DataFusion 16.0.0 [\#115](https://github.com/apache/datafusion-python/pull/115) ([andygrove](https://github.com/andygrove)) +- Bump async-trait from 0.1.57 to 0.1.60 [\#114](https://github.com/apache/datafusion-python/pull/114) ([dependabot[bot]](https://github.com/apps/dependabot)) +- Bump object_store from 0.5.1 to 0.5.2 [\#112](https://github.com/apache/datafusion-python/pull/112) ([dependabot[bot]](https://github.com/apps/dependabot)) +- Bump tokio from 1.21.2 to 1.23.0 [\#109](https://github.com/apache/datafusion-python/pull/109) ([dependabot[bot]](https://github.com/apps/dependabot)) +- Add entries for publishing production \(asf-site\) and staging docs [\#107](https://github.com/apache/datafusion-python/pull/107) ([martin-g](https://github.com/martin-g)) +- Add a workflow that builds the docs and deploys them at staged or production [\#104](https://github.com/apache/datafusion-python/pull/104) ([martin-g](https://github.com/martin-g)) +- Upgrade to DataFusion 15.0.0 [\#103](https://github.com/apache/datafusion-python/pull/103) ([andygrove](https://github.com/andygrove)) +- build\(deps\): bump futures from 0.3.24 to 0.3.25 [\#102](https://github.com/apache/datafusion-python/pull/102) ([dependabot[bot]](https://github.com/apps/dependabot)) +- build\(deps\): bump pyo3 from 0.17.2 to 0.17.3 
[\#101](https://github.com/apache/datafusion-python/pull/101) ([dependabot[bot]](https://github.com/apps/dependabot)) +- build\(deps\): bump mimalloc from 0.1.30 to 0.1.32 [\#98](https://github.com/apache/datafusion-python/pull/98) ([dependabot[bot]](https://github.com/apps/dependabot)) +- build\(deps\): bump rand from 0.7.3 to 0.8.5 [\#97](https://github.com/apache/datafusion-python/pull/97) ([dependabot[bot]](https://github.com/apps/dependabot)) +- Fix GitHub actions warnings [\#95](https://github.com/apache/datafusion-python/pull/95) ([martin-g](https://github.com/martin-g)) +- Fixes \#81 - Add CI workflow for source distribution [\#93](https://github.com/apache/datafusion-python/pull/93) ([martin-g](https://github.com/martin-g)) +- post-release updates [\#91](https://github.com/apache/datafusion-python/pull/91) ([andygrove](https://github.com/andygrove)) +- Build for manylinux 2014 [\#88](https://github.com/apache/datafusion-python/pull/88) ([martin-g](https://github.com/martin-g)) +- update release readme tag [\#86](https://github.com/apache/datafusion-python/pull/86) ([Jimexist](https://github.com/Jimexist)) +- Upgrade Maturin to 0.14.2 [\#85](https://github.com/apache/datafusion-python/pull/85) ([martin-g](https://github.com/martin-g)) +- Update release instructions [\#83](https://github.com/apache/datafusion-python/pull/83) ([andygrove](https://github.com/andygrove)) +- \[Functions\] - Add python function binding to `functions` [\#73](https://github.com/apache/datafusion-python/pull/73) ([francis-du](https://github.com/francis-du)) + +## [0.8.0-rc1](https://github.com/apache/datafusion-python/tree/0.8.0-rc1) (2023-02-17) + +[Full Changelog](https://github.com/apache/datafusion-python/compare/0.7.0-rc2...0.8.0-rc1) + +**Implemented enhancements:** + +- Add bindings for datafusion_common::DFField [\#184](https://github.com/apache/datafusion-python/issues/184) +- Add bindings for DFSchema/DFSchemaRef 
[\#181](https://github.com/apache/datafusion-python/issues/181) +- Add bindings for datafusion_expr Projection [\#179](https://github.com/apache/datafusion-python/issues/179) +- Add bindings for `TableScan` struct from `datafusion_expr::TableScan` [\#177](https://github.com/apache/datafusion-python/issues/177) +- Add a "mapping" struct for types [\#172](https://github.com/apache/datafusion-python/issues/172) +- Improve string representation of datafusion classes \(dataframe, context, expression, ...\) [\#158](https://github.com/apache/datafusion-python/issues/158) +- Add DataFrame count method [\#151](https://github.com/apache/datafusion-python/issues/151) +- \[REQUEST\] Github Actions Improvements [\#146](https://github.com/apache/datafusion-python/issues/146) +- Change default branch name from master to main [\#144](https://github.com/apache/datafusion-python/issues/144) +- Bump pyo3 to 0.18.0 [\#140](https://github.com/apache/datafusion-python/issues/140) +- Add script for Python linting [\#134](https://github.com/apache/datafusion-python/issues/134) +- Add Python bindings for substrait module [\#132](https://github.com/apache/datafusion-python/issues/132) +- Expand unit tests for built-in functions [\#128](https://github.com/apache/datafusion-python/issues/128) +- support creating arrow-datafusion-python conda environment [\#122](https://github.com/apache/datafusion-python/issues/122) +- Build Python source distribution in GitHub workflow [\#81](https://github.com/apache/datafusion-python/issues/81) +- EPIC: Add all functions to python binding `functions` [\#72](https://github.com/apache/datafusion-python/issues/72) + +**Fixed bugs:** + +- Build is broken [\#161](https://github.com/apache/datafusion-python/issues/161) +- Out of memory when sorting [\#157](https://github.com/apache/datafusion-python/issues/157) +- window_lead test appears to be non-deterministic [\#135](https://github.com/apache/datafusion-python/issues/135) +- Reading csv does not work 
[\#130](https://github.com/apache/datafusion-python/issues/130) +- Github actions produce a lot of warnings [\#94](https://github.com/apache/datafusion-python/issues/94) +- ASF source release tarball has wrong directory name [\#90](https://github.com/apache/datafusion-python/issues/90) +- Python Release Build failing after upgrading to maturin 14.2 [\#87](https://github.com/apache/datafusion-python/issues/87) +- Maturin build hangs on Linux ARM64 [\#84](https://github.com/apache/datafusion-python/issues/84) +- Cannot install on Mac M1 from source tarball from testpypi [\#82](https://github.com/apache/datafusion-python/issues/82) +- ImportPathMismatchError when running pytest locally [\#77](https://github.com/apache/datafusion-python/issues/77) + +**Closed issues:** + +- Publish documentation for Python bindings [\#39](https://github.com/apache/datafusion-python/issues/39) +- Add Python binding for `approx_median` [\#32](https://github.com/apache/datafusion-python/issues/32) +- Release version 0.7.0 [\#7](https://github.com/apache/datafusion-python/issues/7) + +## [0.7.0-rc2](https://github.com/apache/datafusion-python/tree/0.7.0-rc2) (2022-11-26) + +[Full Changelog](https://github.com/apache/datafusion-python/compare/0.7.0...0.7.0-rc2) + +## [Unreleased](https://github.com/datafusion-contrib/datafusion-python/tree/HEAD) + +[Full Changelog](https://github.com/datafusion-contrib/datafusion-python/compare/0.5.1...HEAD) + +**Merged pull requests:** + +- use \_\_getitem\_\_ for df column selection [\#41](https://github.com/datafusion-contrib/datafusion-python/pull/41) ([Jimexist](https://github.com/Jimexist)) +- fix demo in readme [\#40](https://github.com/datafusion-contrib/datafusion-python/pull/40) ([Jimexist](https://github.com/Jimexist)) +- Implement select_columns [\#39](https://github.com/datafusion-contrib/datafusion-python/pull/39) ([andygrove](https://github.com/andygrove)) +- update readme and changelog 
[\#38](https://github.com/datafusion-contrib/datafusion-python/pull/38) ([Jimexist](https://github.com/Jimexist)) +- Add PyDataFrame.explain [\#36](https://github.com/datafusion-contrib/datafusion-python/pull/36) ([andygrove](https://github.com/andygrove)) +- Release 0.5.0 [\#34](https://github.com/datafusion-contrib/datafusion-python/pull/34) ([Jimexist](https://github.com/Jimexist)) +- disable nightly in workflow [\#33](https://github.com/datafusion-contrib/datafusion-python/pull/33) ([Jimexist](https://github.com/Jimexist)) +- update requirements to 37 and 310, update readme [\#32](https://github.com/datafusion-contrib/datafusion-python/pull/32) ([Jimexist](https://github.com/Jimexist)) +- Add custom global allocator [\#30](https://github.com/datafusion-contrib/datafusion-python/pull/30) ([matthewmturner](https://github.com/matthewmturner)) +- Remove pandas dependency [\#25](https://github.com/datafusion-contrib/datafusion-python/pull/25) ([matthewmturner](https://github.com/matthewmturner)) +- upgrade datafusion and pyo3 [\#20](https://github.com/datafusion-contrib/datafusion-python/pull/20) ([Jimexist](https://github.com/Jimexist)) +- update maturin 0.12+ [\#17](https://github.com/datafusion-contrib/datafusion-python/pull/17) ([Jimexist](https://github.com/Jimexist)) +- Update README.md [\#16](https://github.com/datafusion-contrib/datafusion-python/pull/16) ([Jimexist](https://github.com/Jimexist)) +- apply cargo clippy --fix [\#15](https://github.com/datafusion-contrib/datafusion-python/pull/15) ([Jimexist](https://github.com/Jimexist)) +- update test workflow to include rust clippy and check [\#14](https://github.com/datafusion-contrib/datafusion-python/pull/14) ([Jimexist](https://github.com/Jimexist)) +- use maturin 0.12.6 [\#13](https://github.com/datafusion-contrib/datafusion-python/pull/13) ([Jimexist](https://github.com/Jimexist)) +- apply cargo fmt [\#12](https://github.com/datafusion-contrib/datafusion-python/pull/12) 
([Jimexist](https://github.com/Jimexist)) +- use stable not nightly [\#11](https://github.com/datafusion-contrib/datafusion-python/pull/11) ([Jimexist](https://github.com/Jimexist)) +- ci: test against more compilers, setup clippy and fix clippy lints [\#9](https://github.com/datafusion-contrib/datafusion-python/pull/9) ([cpcloud](https://github.com/cpcloud)) +- Fix use of importlib.metadata and unify requirements.txt [\#8](https://github.com/datafusion-contrib/datafusion-python/pull/8) ([cpcloud](https://github.com/cpcloud)) +- Ship the Cargo.lock file in the source distribution [\#7](https://github.com/datafusion-contrib/datafusion-python/pull/7) ([cpcloud](https://github.com/cpcloud)) +- add \_\_version\_\_ attribute to datafusion object [\#3](https://github.com/datafusion-contrib/datafusion-python/pull/3) ([tfeda](https://github.com/tfeda)) +- fix ci by fixing directories [\#2](https://github.com/datafusion-contrib/datafusion-python/pull/2) ([Jimexist](https://github.com/Jimexist)) +- setup workflow [\#1](https://github.com/datafusion-contrib/datafusion-python/pull/1) ([Jimexist](https://github.com/Jimexist)) + +## [0.5.1](https://github.com/datafusion-contrib/datafusion-python/tree/0.5.1) (2022-03-15) + +[Full Changelog](https://github.com/datafusion-contrib/datafusion-python/compare/0.5.1-rc1...0.5.1) + +## [0.5.1-rc1](https://github.com/datafusion-contrib/datafusion-python/tree/0.5.1-rc1) (2022-03-15) + +[Full Changelog](https://github.com/datafusion-contrib/datafusion-python/compare/0.5.0...0.5.1-rc1) + +## [0.5.0](https://github.com/datafusion-contrib/datafusion-python/tree/0.5.0) (2022-03-10) + +[Full Changelog](https://github.com/datafusion-contrib/datafusion-python/compare/0.5.0-rc2...0.5.0) + +## [0.5.0-rc2](https://github.com/datafusion-contrib/datafusion-python/tree/0.5.0-rc2) (2022-03-10) + +[Full Changelog](https://github.com/datafusion-contrib/datafusion-python/compare/0.5.0-rc1...0.5.0-rc2) + +**Closed issues:** + +- Add support for Ballista 
[\#37](https://github.com/datafusion-contrib/datafusion-python/issues/37) +- Implement DataFrame.explain [\#35](https://github.com/datafusion-contrib/datafusion-python/issues/35) + +## [0.5.0-rc1](https://github.com/datafusion-contrib/datafusion-python/tree/0.5.0-rc1) (2022-03-09) + +[Full Changelog](https://github.com/datafusion-contrib/datafusion-python/compare/4c98b8e9c3c3f8e2e6a8f2d1ffcfefda344c4680...0.5.0-rc1) + +**Closed issues:** + +- Investigate exposing additional optimizations [\#28](https://github.com/datafusion-contrib/datafusion-python/issues/28) +- Use custom allocator in Python build [\#27](https://github.com/datafusion-contrib/datafusion-python/issues/27) +- Why is pandas a requirement? [\#24](https://github.com/datafusion-contrib/datafusion-python/issues/24) +- Unable to build [\#18](https://github.com/datafusion-contrib/datafusion-python/issues/18) +- Setup CI against multiple Python version [\#6](https://github.com/datafusion-contrib/datafusion-python/issues/6) diff --git a/dev/check_crates_patch.py b/dev/check_crates_patch.py new file mode 100644 index 000000000..74e489e1f --- /dev/null +++ b/dev/check_crates_patch.py @@ -0,0 +1,61 @@ +#!/usr/bin/env python3 +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +"""Check that no Cargo.toml files contain [patch.crates-io] entries. + +Release builds must not depend on patched crates. During development it is +common to temporarily patch crates-io dependencies, but those patches must +be removed before creating a release. + +An empty [patch.crates-io] section is allowed. +""" + +import sys +from pathlib import Path + +import tomllib + + +def main() -> int: + errors: list[str] = [] + for cargo_toml in sorted(Path().rglob("Cargo.toml")): + if "target" in cargo_toml.parts: + continue + with Path.open(cargo_toml, "rb") as f: + data = tomllib.load(f) + patch = data.get("patch", {}).get("crates-io", {}) + if patch: + errors.append(str(cargo_toml)) + for name, spec in patch.items(): + errors.append(f" {name} = {spec}") + + if errors: + print("ERROR: Release builds must not contain [patch.crates-io] entries.") + print() + for line in errors: + print(line) + print() + print("Remove all [patch.crates-io] entries before creating a release.") + return 1 + + print("OK: No [patch.crates-io] entries found.") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/dev/clean.sh b/dev/clean.sh new file mode 100755 index 000000000..0d86680e8 --- /dev/null +++ b/dev/clean.sh @@ -0,0 +1,62 @@ +#!/usr/bin/env bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. +# + +# This cleans up the project by removing build artifacts and other generated files. + +# Function to remove a directory and print the action +remove_dir() { + if [ -d "$1" ]; then + echo "Removing directory: $1" + rm -rf "$1" + fi +} + +# Function to remove a file and print the action +remove_file() { + if [ -f "$1" ]; then + echo "Removing file: $1" + rm -f "$1" + fi +} + +# Remove .pytest_cache directory +remove_dir .pytest_cache/ + +# Remove target directory +remove_dir target/ + +# Remove any __pycache__ directories +find python/ -type d -name "__pycache__" -print | while read -r dir; do + remove_dir "$dir" +done + +# Remove pytest-coverage.lcov file +# remove_file .coverage +# remove_file pytest-coverage.lcov + +# Remove rust-coverage.lcov file +# remove_file rust-coverage.lcov + +# Remove pyo3 files +find python/ -type f -name '_internal.*.so' -print | while read -r file; do + remove_file "$file" +done + +echo "Cleanup complete." 
\ No newline at end of file diff --git a/dev/create_license.py b/dev/create_license.py index 2a67cb8fd..acbf8587c 100644 --- a/dev/create_license.py +++ b/dev/create_license.py @@ -20,12 +20,11 @@ import json import subprocess +from pathlib import Path -subprocess.check_output(["cargo", "install", "cargo-license"]) data = subprocess.check_output( [ - "cargo", - "license", + "cargo-license", "--avoid-build-deps", "--avoid-dev-deps", "--do-not-bundle", @@ -248,5 +247,5 @@ result += "------------------\n\n" result += f"### {name} {version}\n* source: [{repository}]({repository})\n* license: {license}\n\n" -with open("LICENSE.txt", "w") as f: +with Path.open("LICENSE.txt", "w") as f: f.write(result) diff --git a/dev/python_lint.sh b/dev/python_lint.sh index 949346294..2d867f29d 100755 --- a/dev/python_lint.sh +++ b/dev/python_lint.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file @@ -21,6 +21,6 @@ # DataFusion CI does set -e -source venv/bin/activate -flake8 --exclude venv --ignore=E501,W503 +source .venv/bin/activate +flake8 --exclude venv,benchmarks/db-benchmark --ignore=E501,W503 black --line-length 79 . diff --git a/dev/release/README.md b/dev/release/README.md index cec0eef5e..ed28f4aa6 100644 --- a/dev/release/README.md +++ b/dev/release/README.md @@ -56,33 +56,43 @@ Before creating a new release: - a PR should be created and merged to update the major version number of the project - A new release branch should be created, such as `branch-0.8` -### Update CHANGELOG.md +## Preparing a Release Candidate -Define release branch (e.g. `branch-0.8`), base version tag (e.g. `0.7.0`) and future version tag (e.g. `0.9.0`). Commits -between the base version tag and the release branch will be used to populate the changelog content. +### Change Log + +We maintain a `CHANGELOG.md` so our users know what has been changed between releases. 
+ +The changelog is generated using a Python script: ```bash -# create the changelog -CHANGELOG_GITHUB_TOKEN= ./dev/release/update_change_log-datafusion-python.sh main 0.8.0 0.7.0 -# review change log / edit issues and labels if needed, rerun until you are happy with the result -git commit -a -m 'Create changelog for release' +$ GITHUB_TOKEN= ./dev/release/generate-changelog.py 24.0.0 HEAD 25.0.0 > dev/changelog/25.0.0.md ``` -_If you see the error `"You have exceeded a secondary rate limit"` when running this script, try reducing the CPU -allocation to slow the process down and throttle the number of GitHub requests made per minute, by modifying the -value of the `--cpus` argument in the `update_change_log.sh` script._ +This script creates a changelog from GitHub PRs based on the labels associated with them as well as looking for +titles starting with `feat:`, `fix:`, or `docs:` . The script will produce output similar to: -You can add `invalid` or `development-process` label to exclude items from -release notes. +``` +Fetching list of commits between 24.0.0 and HEAD +Fetching pull requests +Categorizing pull requests +Generating changelog content +``` -Send a PR to get these changes merged into the release branch (e.g. `branch-0.8`). If new commits that could change the -change log content landed in the release branch before you could merge the PR, you need to rerun the changelog update -script to regenerate the changelog and update the PR accordingly. +### Update the version number -### Preparing a Release Candidate +The only place you should need to update the version is in the root `Cargo.toml`. +After updating the toml file, run `cargo update` to update the cargo lock file. +If you do not want to update all the dependencies, you can instead run `cargo build` +which should only update the version number for `datafusion-python`. ### Tag the Repository +Commit the changes to the changelog and version. 
+ +Assuming you have set up a remote to the `apache` repository rather than your personal fork, +you need to push a tag to start the CI process for release candidates. The following assumes +the upstream repository is called `apache`. + ```bash git tag 0.8.0-rc1 git push apache 0.8.0-rc1 @@ -94,42 +104,7 @@ git push apache 0.8.0-rc1 ./dev/release/create-tarball.sh 0.8.0 1 ``` -This will also create the email template to send to the mailing list. Here is an example: - -``` -To: dev@arrow.apache.org -Subject: [VOTE][RUST][DataFusion] Release DataFusion Python Bindings 0.7.0 RC2 -Hi, - -I would like to propose a release of Apache Arrow DataFusion Python Bindings, -version 0.7.0. - -This release candidate is based on commit: bd1b78b6d444b7ab172c6aec23fa58c842a592d7 [1] -The proposed release tarball and signatures are hosted at [2]. -The changelog is located at [3]. -The Python wheels are located at [4]. - -Please download, verify checksums and signatures, run the unit tests, and vote -on the release. The vote will be open for at least 72 hours. - -Only votes from PMC members are binding, but all members of the community are -encouraged to test the release and vote with "(non-binding)". - -The standard verification procedure is documented at https://github.com/apache/arrow-datafusion-python/blob/main/dev/release/README.md#verifying-release-candidates. - -[ ] +1 Release this as Apache Arrow DataFusion Python 0.7.0 -[ ] +0 -[ ] -1 Do not release this as Apache Arrow DataFusion Python 0.7.0 because... - -Here is my vote: - -+1 - -[1]: https://github.com/apache/arrow-datafusion-python/tree/bd1b78b6d444b7ab172c6aec23fa58c842a592d7 -[2]: https://dist.apache.org/repos/dist/dev/arrow/apache-arrow-datafusion-python-0.7.0-rc2 -[3]: https://github.com/apache/arrow-datafusion-python/blob/bd1b78b6d444b7ab172c6aec23fa58c842a592d7/CHANGELOG.md -[4]: https://test.pypi.org/project/datafusion/0.7.0/ -``` +This will also create the email template to send to the mailing list. 
Create a draft email using this content, but do not send until after completing the next step. @@ -142,17 +117,25 @@ This section assumes some familiarity with publishing Python packages to PyPi. F Pushing an `rc` tag to the release branch will cause a GitHub Workflow to run that will build the Python wheels. -Go to https://github.com/apache/arrow-datafusion-python/actions and look for an action named "Python Release Build" +Go to https://github.com/apache/datafusion-python/actions and look for an action named "Python Release Build" that has run against the pushed tag. -Click on the action and scroll down to the bottom of the page titled "Artifacts". Download `dist.zip`. +Click on the action and scroll down to the bottom of the page titled "Artifacts". Download `dist.zip`. It should +contain files such as: + +```text +datafusion-22.0.0-cp37-abi3-macosx_10_7_x86_64.whl +datafusion-22.0.0-cp37-abi3-macosx_11_0_arm64.whl +datafusion-22.0.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl +datafusion-22.0.0-cp37-abi3-win_amd64.whl +``` Upload the wheels to testpypi. ```bash unzip dist.zip python3 -m pip install --upgrade setuptools twine build -python3 -m twine upload --repository testpypi datafusion-0.7.0-cp37-abi3-*.whl +python3 -m twine upload --repository testpypi datafusion-22.0.0-cp37-abi3-*.whl ``` When prompted for username, enter `__token__`. When prompted for a password, enter a valid GitHub Personal Access Token @@ -162,7 +145,7 @@ When prompted for username, enter `__token__`. When prompted for a password, ent Download the source tarball created in the previous step, untar it, and run: ```bash -python3 -m build +maturin sdist ``` This will create a file named `dist/datafusion-0.7.0.tar.gz`. Upload this to testpypi: @@ -171,16 +154,62 @@ This will create a file named `dist/datafusion-0.7.0.tar.gz`. 
Upload this to tes python3 -m twine upload --repository testpypi dist/datafusion-0.7.0.tar.gz ``` +### Run Verify Release Candidate Workflow + +Before sending the vote email, run the manually triggered GitHub Actions workflow +"Verify Release Candidate" and confirm all matrix jobs pass across the OS/architecture matrix +(for example, Linux, macOS, and Windows runners): + +1. Go to https://github.com/apache/datafusion-python/actions/workflows/verify-release-candidate.yml +2. Click "Run workflow" +3. Set `version` to the release version (for example, `52.0.0`) +4. Set `rc_number` to the RC number (for example, `0`) +5. Wait for all jobs to complete successfully + +Include a short note in the vote email template that this workflow was run across all OS/architecture +matrix entries and that all jobs passed. + +```text +Verification note: The manually triggered "Verify Release Candidate" workflow was run for version and rc_number across all configured OS/architecture matrix entries, and all matrix jobs completed successfully. +``` + ### Send the Email Send the email to start the vote. 
## Verifying a Release -Install the release from testpypi: +Releases may be verified using `verify-release-candidate.sh`: ```bash -pip install --extra-index-url https://test.pypi.org/simple/ datafusion==0.7.0 +git clone https://github.com/apache/datafusion-python.git +dev/release/verify-release-candidate.sh 48.0.0 1 +``` + +Alternatively, one can run unit tests against a testpypi release candidate: + +```bash +# clone a fresh repo +git clone https://github.com/apache/datafusion-python.git +cd datafusion-python + +# checkout the release commit +git fetch --tags +git checkout 40.0.0-rc1 +git submodule update --init --recursive + +# create the env +python3 -m venv .venv +source .venv/bin/activate + +# install release candidate +pip install --extra-index-url https://test.pypi.org/simple/ datafusion==40.0.0 + +# install test dependencies +pip install pytest numpy pytest-asyncio + +# run the tests +pytest --import-mode=importlib python/tests -vv ``` Try running one of the examples from the top-level README, or write some custom Python code to query some available @@ -198,6 +227,14 @@ Create the source release tarball: ./dev/release/release-tarball.sh 0.8.0 1 ``` +### Publishing Rust Crate to crates.io + +Some projects depend on the Rust crate directly, so we publish this to crates.io + +```shell +cargo publish +``` + ### Publishing Python Artifacts to PyPi Go to the Test PyPI page of Datafusion, and download @@ -208,33 +245,59 @@ uploading them using `twine`: twine upload --repository pypi dist-release/* ``` -### Publish Python Artifacts to Anaconda +### Publish Python Artifacts to conda-forge + +Pypi packages auto upload to conda-forge via [datafusion feedstock](https://github.com/conda-forge/datafusion-feedstock) -Publishing artifacts to Anaconda is similar to PyPi. First, Download the source tarball created in the previous step and untar it. 
+### Push the Release Tag ```bash -# Assuming you have an existing conda environment named `datafusion-dev` if not see root README for instructions -conda activate datafusion-dev -conda build . +git checkout 0.8.0-rc1 +git tag 0.8.0 +git push apache 0.8.0 ``` -This will setup a virtual conda environment and build the artifacts inside of that virtual env. This step can take a few minutes as the entire build, host, and runtime environments are setup. Once complete a local filesystem path will be emitted for the location of the resulting package. Observe that path and copy to your clipboard. +### Add the release to Apache Reporter + +Add the release to https://reporter.apache.org/addrelease.html?datafusion with a version name prefixed with `DATAFUSION-PYTHON`, +for example `DATAFUSION-PYTHON-31.0.0`. + +The release information is used to generate a template for a board report (see example from Apache Arrow +[here](https://github.com/apache/arrow/pull/14357)). + +### Delete old RCs and Releases + +See the ASF documentation on [when to archive](https://www.apache.org/legal/release-policy.html#when-to-archive) +for more information. -Ex: `/home/conda/envs/datafusion/conda-bld/linux-64/datafusion-0.7.0.tar.bz2` +#### Deleting old release candidates from `dev` svn -Now you are ready to publish this resulting package to anaconda.org. This can be accomplished in a few simple steps. +Release candidates should be deleted once the release is published. 
+ +Get a list of DataFusion release candidates: ```bash -# First login to Anaconda with the datafusion credentials -anaconda login -# Upload the package -anaconda upload /home/conda/envs/datafusion/conda-bld/linux-64/datafusion-0.7.0.tar.bz2 +svn ls https://dist.apache.org/repos/dist/dev/datafusion | grep datafusion-python ``` -### Push the Release Tag +Delete a release candidate: ```bash -git checkout 0.8.0-rc1 -git tag 0.8.0 -git push apache 0.8.0 +svn delete -m "delete old DataFusion RC" https://dist.apache.org/repos/dist/dev/datafusion/apache-datafusion-python-7.1.0-rc1/ +``` + +#### Deleting old releases from `release` svn + +Only the latest release should be available. Delete old releases after publishing the new release. + +Get a list of DataFusion releases: + +```bash +svn ls https://dist.apache.org/repos/dist/release/datafusion | grep datafusion-python +``` + +Delete a release: + +```bash +svn delete -m "delete old DataFusion release" https://dist.apache.org/repos/dist/release/datafusion/datafusion-python-7.0.0 ``` diff --git a/dev/release/check-rat-report.py b/dev/release/check-rat-report.py index 30a01116b..72a35212e 100644 --- a/dev/release/check-rat-report.py +++ b/dev/release/check-rat-report.py @@ -21,17 +21,16 @@ import re import sys import xml.etree.ElementTree as ET +from pathlib import Path if len(sys.argv) != 3: - sys.stderr.write( - "Usage: %s exclude_globs.lst rat_report.xml\n" % sys.argv[0] - ) + sys.stderr.write("Usage: %s exclude_globs.lst rat_report.xml\n" % sys.argv[0]) sys.exit(1) exclude_globs_filename = sys.argv[1] xml_filename = sys.argv[2] -globs = [line.strip() for line in open(exclude_globs_filename, "r")] +globs = [line.strip() for line in Path.open(exclude_globs_filename)] tree = ET.parse(xml_filename) root = tree.getroot() diff --git a/dev/release/create-tarball.sh b/dev/release/create-tarball.sh index c05da5b75..d6ca76561 100755 --- a/dev/release/create-tarball.sh +++ b/dev/release/create-tarball.sh @@ -21,9 +21,9 @@ # Adapted 
from https://github.com/apache/arrow-rs/tree/master/dev/release/create-tarball.sh # This script creates a signed tarball in -# dev/dist/apache-arrow-datafusion-python--.tar.gz and uploads it to +# dev/dist/apache-datafusion-python--.tar.gz and uploads it to # the "dev" area of the dist.apache.arrow repository and prepares an -# email for sending to the dev@arrow.apache.org list for a formal +# email for sending to the dev@datafusion.apache.org list for a formal # vote. # # See release/README.md for full release instructions @@ -65,25 +65,25 @@ tag="${version}-rc${rc}" echo "Attempting to create ${tarball} from tag ${tag}" release_hash=$(cd "${SOURCE_TOP_DIR}" && git rev-list --max-count=1 ${tag}) -release=apache-arrow-datafusion-python-${version} +release=apache-datafusion-python-${version} distdir=${SOURCE_TOP_DIR}/dev/dist/${release}-rc${rc} tarname=${release}.tar.gz tarball=${distdir}/${tarname} -url="https://dist.apache.org/repos/dist/dev/arrow/${release}-rc${rc}" +url="https://dist.apache.org/repos/dist/dev/datafusion/${release}-rc${rc}" if [ -z "$release_hash" ]; then echo "Cannot continue: unknown git tag: ${tag}" fi -echo "Draft email for dev@arrow.apache.org mailing list" +echo "Draft email for dev@datafusion.apache.org mailing list" echo "" echo "---------------------------------------------------------" cat < ${tarball}.sha256 (cd ${distdir} && shasum -a 512 ${tarname}) > ${tarball}.sha512 -echo "Uploading to apache dist/dev to ${url}" -svn co --depth=empty https://dist.apache.org/repos/dist/dev/arrow ${SOURCE_TOP_DIR}/dev/dist +echo "Uploading to datafusion dist/dev to ${url}" +svn co --depth=empty https://dist.apache.org/repos/dist/dev/datafusion ${SOURCE_TOP_DIR}/dev/dist svn add ${distdir} -svn ci -m "Apache Arrow DataFusion Python ${version} ${rc}" ${distdir} +svn ci -m "Apache DataFusion Python ${version} ${rc}" ${distdir} diff --git a/dev/release/generate-changelog.py b/dev/release/generate-changelog.py new file mode 100755 index 
000000000..d86736773 --- /dev/null +++ b/dev/release/generate-changelog.py @@ -0,0 +1,179 @@ +#!/usr/bin/env python + +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import os +import re +import subprocess +import sys + +from github import Github + + +def print_pulls(repo_name, title, pulls) -> None: + if len(pulls) > 0: + print(f"**{title}:**") + print() + for pull, commit in pulls: + url = f"https://github.com/{repo_name}/pull/{pull.number}" + print(f"- {pull.title} [#{pull.number}]({url}) ({commit.author.login})") + print() + + +def generate_changelog(repo, repo_name, tag1, tag2, version) -> None: + # get a list of commits between two tags + print(f"Fetching list of commits between {tag1} and {tag2}", file=sys.stderr) + comparison = repo.compare(tag1, tag2) + + # get the pull requests for these commits + print("Fetching pull requests", file=sys.stderr) + unique_pulls = [] + all_pulls = [] + for commit in comparison.commits: + pulls = commit.get_pulls() + for pull in pulls: + # there can be multiple commits per PR if squash merge is not being used and + # in this case we should get all the author names, but for now just pick one + if pull.number not in unique_pulls: + unique_pulls.append(pull.number) + 
all_pulls.append((pull, commit)) + + # we split the pulls into categories + breaking = [] + bugs = [] + docs = [] + enhancements = [] + performance = [] + other = [] + + # categorize the pull requests based on GitHub labels + print("Categorizing pull requests", file=sys.stderr) + for pull, commit in all_pulls: + # see if PR title uses Conventional Commits + cc_type = "" + cc_breaking = "" + parts = re.findall(r"^([a-z]+)(\([a-z]+\))?(!)?:", pull.title) + if len(parts) == 1: + parts_tuple = parts[0] + cc_type = parts_tuple[0] # fix, feat, docs, chore + # cc_scope = parts_tuple[1] # component within project + cc_breaking = parts_tuple[2] == "!" + + labels = [label.name for label in pull.labels] + if "api change" in labels or cc_breaking: + breaking.append((pull, commit)) + elif "bug" in labels or cc_type == "fix": + bugs.append((pull, commit)) + elif "performance" in labels or cc_type == "perf": + performance.append((pull, commit)) + elif "enhancement" in labels or cc_type == "feat": + enhancements.append((pull, commit)) + elif "documentation" in labels or cc_type == "docs" or cc_type == "doc": + docs.append((pull, commit)) + else: + other.append((pull, commit)) + + # produce the changelog content + print("Generating changelog content", file=sys.stderr) + + # ASF header + print("""\n""") + + print(f"# Apache DataFusion Python {version} Changelog\n") + + # get the number of commits + commit_count = subprocess.check_output( + f"git log --pretty=oneline {tag1}..{tag2} | wc -l", shell=True, text=True + ).strip() + + # get number of contributors + contributor_count = subprocess.check_output( + f"git shortlog -sn {tag1}..{tag2} | wc -l", shell=True, text=True + ).strip() + + print( + f"This release consists of {commit_count} commits from {contributor_count} contributors. 
" + f"See credits at the end of this changelog for more information.\n" + ) + + print_pulls(repo_name, "Breaking changes", breaking) + print_pulls(repo_name, "Performance related", performance) + print_pulls(repo_name, "Implemented enhancements", enhancements) + print_pulls(repo_name, "Fixed bugs", bugs) + print_pulls(repo_name, "Documentation updates", docs) + print_pulls(repo_name, "Other", other) + + # show code contributions + credits = subprocess.check_output( + f"git shortlog -sn {tag1}..{tag2}", shell=True, text=True + ).rstrip() + + print("## Credits\n") + print( + "Thank you to everyone who contributed to this release. Here is a breakdown of commits (PRs merged) " + "per contributor.\n" + ) + print("```") + print(credits) + print("```\n") + + print( + "Thank you also to everyone who contributed in other ways such as filing issues, reviewing " + "PRs, and providing feedback on this release.\n" + ) + + +def cli(args=None) -> None: + """Process command line arguments.""" + if not args: + args = sys.argv[1:] + + parser = argparse.ArgumentParser() + parser.add_argument("tag1", help="The previous commit or tag (e.g. 0.1.0)") + parser.add_argument("tag2", help="The current commit or tag (e.g. 
HEAD)") + parser.add_argument( + "version", help="The version number to include in the changelog" + ) + args = parser.parse_args() + + token = os.getenv("GITHUB_TOKEN") + project = "apache/datafusion-python" + + g = Github(token) + repo = g.get_repo(project) + generate_changelog(repo, project, args.tag1, args.tag2, args.version) + + +if __name__ == "__main__": + cli() diff --git a/dev/release/rat_exclude_files.txt b/dev/release/rat_exclude_files.txt index db5379d89..dcd5d9aac 100644 --- a/dev/release/rat_exclude_files.txt +++ b/dev/release/rat_exclude_files.txt @@ -42,4 +42,9 @@ Cargo.lock .history *rat.txt */.git -docs.yaml \ No newline at end of file +.github/* +benchmarks/tpch/queries/q*.sql +benchmarks/tpch/create_tables.sql +.cargo/config.toml +**/.cargo/config.toml +uv.lock \ No newline at end of file diff --git a/dev/release/release-tarball.sh b/dev/release/release-tarball.sh index f5e8eb1bf..2b82d1bac 100755 --- a/dev/release/release-tarball.sh +++ b/dev/release/release-tarball.sh @@ -43,7 +43,14 @@ fi version=$1 rc=$2 -tmp_dir=tmp-apache-arrow-datafusion-python-dist +read -r -p "Proceed to release tarball for ${version}-rc${rc}? [y/N]: " answer +answer=${answer:-no} +if [ "${answer}" != "y" ]; then + echo "Cancelled tarball release!" 
+ exit 1 +fi + +tmp_dir=tmp-apache-datafusion-python-dist echo "Recreate temporary directory: ${tmp_dir}" rm -rf ${tmp_dir} @@ -52,23 +59,23 @@ mkdir -p ${tmp_dir} echo "Clone dev dist repository" svn \ co \ - https://dist.apache.org/repos/dist/dev/arrow/apache-arrow-datafusion-python-${version}-rc${rc} \ + https://dist.apache.org/repos/dist/dev/datafusion/apache-datafusion-python-${version}-rc${rc} \ ${tmp_dir}/dev echo "Clone release dist repository" -svn co https://dist.apache.org/repos/dist/release/arrow ${tmp_dir}/release +svn co https://dist.apache.org/repos/dist/release/datafusion ${tmp_dir}/release echo "Copy ${version}-rc${rc} to release working copy" -release_version=arrow-datafusion-python-${version} +release_version=datafusion-python-${version} mkdir -p ${tmp_dir}/release/${release_version} cp -r ${tmp_dir}/dev/* ${tmp_dir}/release/${release_version}/ svn add ${tmp_dir}/release/${release_version} echo "Commit release" -svn ci -m "Apache Arrow DataFusion Python ${version}" ${tmp_dir}/release +svn ci -m "Apache DataFusion Python ${version}" ${tmp_dir}/release echo "Clean up" rm -rf ${tmp_dir} echo "Success! The release is available here:" -echo " https://dist.apache.org/repos/dist/release/arrow/${release_version}" +echo " https://dist.apache.org/repos/dist/release/datafusion/${release_version}" diff --git a/dev/release/update_change_log-datafusion-python.sh b/dev/release/update_change_log-datafusion-python.sh deleted file mode 100755 index a11447f0b..000000000 --- a/dev/release/update_change_log-datafusion-python.sh +++ /dev/null @@ -1,33 +0,0 @@ -#!/bin/bash -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# - -# Usage: -# CHANGELOG_GITHUB_TOKEN= ./update_change_log-datafusion.sh main 8.0.0 7.1.0 -# CHANGELOG_GITHUB_TOKEN= ./update_change_log-datafusion.sh maint-7.x 7.1.0 7.0.0 - -RELEASE_BRANCH=$1 -RELEASE_TAG=$2 -BASE_TAG=$3 - -SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -${SOURCE_DIR}/update_change_log.sh \ - "${BASE_TAG}" \ - --future-release "${RELEASE_TAG}" \ - --release-branch "${RELEASE_BRANCH}" diff --git a/dev/release/update_change_log.sh b/dev/release/update_change_log.sh deleted file mode 100755 index a0b398131..000000000 --- a/dev/release/update_change_log.sh +++ /dev/null @@ -1,87 +0,0 @@ -#!/bin/bash -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-# - -# Adapted from https://github.com/apache/arrow-rs/tree/master/dev/release/update_change_log.sh - -# invokes the changelog generator from -# https://github.com/github-changelog-generator/github-changelog-generator -# -# With the config located in -# arrow-datafusion/.github_changelog_generator -# -# Usage: -# CHANGELOG_GITHUB_TOKEN= ./update_change_log.sh - -set -e - -SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -SOURCE_TOP_DIR="$(cd "${SOURCE_DIR}/../../" && pwd)" - -if [[ "$#" -lt 1 ]]; then - echo "USAGE: $0 SINCE_TAG EXTRA_ARGS..." - exit 1 -fi - -SINCE_TAG=$1 -shift 1 - -OUTPUT_PATH="CHANGELOG.md" - -pushd ${SOURCE_TOP_DIR} - -# reset content in changelog -git checkout "${SINCE_TAG}" "${OUTPUT_PATH}" -# remove license header so github-changelog-generator has a clean base to append -sed -i.bak '1,18d' "${OUTPUT_PATH}" - -docker run -it --rm \ - --cpus "0.1" \ - -e CHANGELOG_GITHUB_TOKEN=$CHANGELOG_GITHUB_TOKEN \ - -v "$(pwd)":/usr/local/src/your-app \ - githubchangeloggenerator/github-changelog-generator \ - --user apache \ - --project arrow-datafusion-python \ - --since-tag "${SINCE_TAG}" \ - --base "${OUTPUT_PATH}" \ - --output "${OUTPUT_PATH}" \ - "$@" - -sed -i.bak "s/\\\n/\n\n/" "${OUTPUT_PATH}" - -echo ' -' | cat - "${OUTPUT_PATH}" > "${OUTPUT_PATH}".tmp -mv "${OUTPUT_PATH}".tmp "${OUTPUT_PATH}" diff --git a/dev/release/verify-release-candidate.sh b/dev/release/verify-release-candidate.sh index be86f69e0..9591e0335 100755 --- a/dev/release/verify-release-candidate.sh +++ b/dev/release/verify-release-candidate.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. 
See the NOTICE file @@ -32,8 +32,8 @@ set -x set -o pipefail SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]:-$0}")" && pwd)" -ARROW_DIR="$(dirname $(dirname ${SOURCE_DIR}))" -ARROW_DIST_URL='https://dist.apache.org/repos/dist/dev/arrow' +DATAFUSION_PYTHON_DIR="$(dirname $(dirname ${SOURCE_DIR}))" +DATAFUSION_PYTHON_DIST_URL='https://dist.apache.org/repos/dist/dev/datafusion' download_dist_file() { curl \ @@ -41,11 +41,11 @@ download_dist_file() { --show-error \ --fail \ --location \ - --remote-name $ARROW_DIST_URL/$1 + --remote-name $DATAFUSION_PYTHON_DIST_URL/$1 } download_rc_file() { - download_dist_file apache-arrow-datafusion-python-${VERSION}-rc${RC_NUMBER}/$1 + download_dist_file apache-datafusion-python-${VERSION}-rc${RC_NUMBER}/$1 } import_gpg_keys() { @@ -89,31 +89,40 @@ verify_dir_artifact_signatures() { setup_tempdir() { cleanup() { if [ "${TEST_SUCCESS}" = "yes" ]; then - rm -fr "${ARROW_TMPDIR}" + rm -fr "${DATAFUSION_PYTHON_TMPDIR}" else - echo "Failed to verify release candidate. See ${ARROW_TMPDIR} for details." + echo "Failed to verify release candidate. See ${DATAFUSION_PYTHON_TMPDIR} for details." fi } - if [ -z "${ARROW_TMPDIR}" ]; then - # clean up automatically if ARROW_TMPDIR is not defined - ARROW_TMPDIR=$(mktemp -d -t "$1.XXXXX") + if [ -z "${DATAFUSION_PYTHON_TMPDIR}" ]; then + # clean up automatically if DATAFUSION_PYTHON_TMPDIR is not defined + DATAFUSION_PYTHON_TMPDIR=$(mktemp -d -t "$1.XXXXX") trap cleanup EXIT else # don't clean up automatically - mkdir -p "${ARROW_TMPDIR}" + mkdir -p "${DATAFUSION_PYTHON_TMPDIR}" fi } test_source_distribution() { - # install rust toolchain in a similar fashion like test-miniconda + # install rust toolchain export RUSTUP_HOME=$PWD/test-rustup export CARGO_HOME=$PWD/test-rustup curl https://sh.rustup.rs -sSf | sh -s -- -y --no-modify-path - export PATH=$RUSTUP_HOME/bin:$PATH - source $RUSTUP_HOME/env + # On Unix, rustup creates an env file. 
On Windows GitHub runners (MSYS bash), + # that file may not exist, so fall back to adding Cargo bin directly. + if [ -f "$CARGO_HOME/env" ]; then + # shellcheck disable=SC1090 + source "$CARGO_HOME/env" + elif [ -f "$RUSTUP_HOME/env" ]; then + # shellcheck disable=SC1090 + source "$RUSTUP_HOME/env" + else + export PATH="$CARGO_HOME/bin:$PATH" + fi # build and test rust @@ -125,11 +134,21 @@ test_source_distribution() { git clone https://github.com/apache/arrow-testing.git testing git clone https://github.com/apache/parquet-testing.git parquet-testing - python3 -m venv venv - source venv/bin/activate - python3 -m pip install -U pip - python3 -m pip install -r requirements-310.txt - maturin develop + python3 -m venv .venv + if [ -x ".venv/bin/python" ]; then + VENV_PYTHON=".venv/bin/python" + elif [ -x ".venv/Scripts/python.exe" ]; then + VENV_PYTHON=".venv/Scripts/python.exe" + elif [ -x ".venv/Scripts/python" ]; then + VENV_PYTHON=".venv/Scripts/python" + else + echo "Unable to find python executable in virtual environment" + exit 1 + fi + + "$VENV_PYTHON" -m pip install -U pip + "$VENV_PYTHON" -m pip install -U maturin + "$VENV_PYTHON" -m maturin develop #TODO: we should really run tests here as well #python3 -m pytest @@ -142,11 +161,11 @@ test_source_distribution() { TEST_SUCCESS=no -setup_tempdir "arrow-${VERSION}" -echo "Working in sandbox ${ARROW_TMPDIR}" -cd ${ARROW_TMPDIR} +setup_tempdir "datafusion-python-${VERSION}" +echo "Working in sandbox ${DATAFUSION_PYTHON_TMPDIR}" +cd ${DATAFUSION_PYTHON_TMPDIR} -dist_name="apache-arrow-datafusion-python-${VERSION}" +dist_name="apache-datafusion-python-${VERSION}" import_gpg_keys fetch_archive ${dist_name} tar xf ${dist_name}.tar.gz diff --git a/dev/rust_lint.sh b/dev/rust_lint.sh index b1285cbc3..eeb9e2302 100755 --- a/dev/rust_lint.sh +++ b/dev/rust_lint.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. 
See the NOTICE file diff --git a/docs/.gitignore b/docs/.gitignore new file mode 100644 index 000000000..6e8a53b6f --- /dev/null +++ b/docs/.gitignore @@ -0,0 +1,4 @@ +pokemon.csv +yellow_trip_data.parquet +yellow_tripdata_2021-01.parquet + diff --git a/docs/Makefile b/docs/Makefile index e65c8e250..49ebae372 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -35,4 +35,4 @@ help: # Catch-all target: route all unknown targets to Sphinx using the new # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). %: Makefile - @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) \ No newline at end of file + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) --fail-on-warning \ No newline at end of file diff --git a/docs/README.md b/docs/README.md index a6f4998c8..502f1c2a1 100644 --- a/docs/README.md +++ b/docs/README.md @@ -19,46 +19,52 @@ # DataFusion Documentation -This folder contains the source content of the [python api](./source/api). -These are both published to https://arrow.apache.org/datafusion/ -as part of the release process. +This folder contains the source content of the [Python API](./source/api). +This is published to https://datafusion.apache.org/python by a GitHub action +when changes are merged to the main branch. ## Dependencies It's recommended to install build dependencies and build the documentation -inside a Python virtualenv. +inside a Python `venv` using `uv`. -- Python -- `pip install -r requirements.txt` +To prepare building the documentation run the following on the root level of the project: + +```bash +# Set up a virtual environment with the documentation dependencies +uv sync --dev --group docs --no-install-package datafusion +``` ## Build & Preview Run the provided script to build the HTML pages. 
```bash -./build.sh +# Build the repository +uv run --no-project maturin develop --uv +# Build the documentation +uv run --no-project docs/build.sh ``` -The HTML will be generated into a `build` directory. +The HTML will be generated into a `build` directory in `docs`. Preview the site on Linux by running this command. ```bash -firefox build/html/index.html +firefox docs/build/html/index.html ``` ## Release Process -The documentation is served through the -[arrow-site](https://github.com/apache/arrow-site/) repo. To release a new -version of the docs, follow these steps: +This documentation is hosted at https://datafusion.apache.org/python -1. Run `./build.sh` inside `docs` folder to generate the docs website inside the `build/html` folder. -2. Clone the arrow-site repo -3. Checkout to the `asf-site` branch (NOT `master`) -4. Copy build artifacts into `arrow-site` repo's `datafusion` folder with a command such as +When the PR is merged to the `main` branch of the DataFusion +repository, a [github workflow](https://github.com/apache/datafusion-python/blob/main/.github/workflows/build.yml) which: -- `cp -rT ./build/html/ ../../arrow-site/datafusion/` (doesn't work on mac) -- `rsync -avzr ./build/html/ ../../arrow-site/datafusion/` +1. Builds the html content +2. Pushes the html content to the [`asf-site`](https://github.com/apache/datafusion-python/tree/asf-site) branch in this repository. -5. Commit changes in `arrow-site` and send a PR. \ No newline at end of file +The Apache Software Foundation provides https://arrow.apache.org/, +which serves content based on the configuration in +[.asf.yaml](https://github.com/apache/datafusion-python/blob/main/.asf.yaml), +which specifies the target as https://datafusion.apache.org/python. 
diff --git a/docs/build.sh b/docs/build.sh old mode 100644 new mode 100755 index 3f24f8eec..f73330323 --- a/docs/build.sh +++ b/docs/build.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file @@ -19,10 +19,23 @@ # set -e + +original_dir=$(pwd) +script_dir=$(dirname "$(realpath "$0")") +cd "$script_dir" || exit + +if [ ! -f pokemon.csv ]; then + curl -O https://gist.githubusercontent.com/ritchie46/cac6b337ea52281aa23c049250a4ff03/raw/89a957ff3919d90e6ef2d34235e6bf22304f3366/pokemon.csv +fi + +if [ ! -f yellow_tripdata_2021-01.parquet ]; then + curl -O https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2021-01.parquet +fi + rm -rf build 2> /dev/null rm -rf temp 2> /dev/null mkdir temp cp -rf source/* temp/ -# replace relative URLs with absolute URLs -#sed -i 's/\.\.\/\.\.\/\.\.\//https:\/\/github.com\/apache\/arrow-datafusion\/blob\/master\//g' temp/contributor-guide/index.md -make SOURCEDIR=`pwd`/temp html \ No newline at end of file +make SOURCEDIR=`pwd`/temp html + +cd "$original_dir" || exit diff --git a/docs/mdbook/README.md b/docs/mdbook/README.md new file mode 100644 index 000000000..6dae6bc62 --- /dev/null +++ b/docs/mdbook/README.md @@ -0,0 +1,33 @@ + +# DataFusion Book + +This folder builds a DataFusion user guide using [mdBook](https://github.com/rust-lang/mdBook). + +## Build and run book locally + +Build the latest files with `mdbook build`. + +Open the book locally by running `open book/index.html`. + +## Install mdBook + +Download the `mdbook` binary or run `cargo install mdbook`. + +Then manually open it, so you have permissions to run it on your Mac. + +Add it to your path with a command like this so you can easily run the commands: `mv ~/Downloads/mdbook /Users/matthew.powers/.local/bin`. 
diff --git a/datafusion/common.py b/docs/mdbook/book.toml similarity index 85% rename from datafusion/common.py rename to docs/mdbook/book.toml index dd56640a4..089cb9a97 100644 --- a/datafusion/common.py +++ b/docs/mdbook/book.toml @@ -15,9 +15,9 @@ # specific language governing permissions and limitations # under the License. - -from ._internal import common - - -def __getattr__(name): - return getattr(common, name) +[book] +authors = ["Apache Arrow "] +language = "en" +multilingual = false +src = "src" +title = "DataFusion Book" diff --git a/docs/mdbook/src/SUMMARY.md b/docs/mdbook/src/SUMMARY.md new file mode 100644 index 000000000..23467ed4c --- /dev/null +++ b/docs/mdbook/src/SUMMARY.md @@ -0,0 +1,25 @@ + +# Summary + +- [Index](./index.md) +- [Installation](./installation.md) +- [Quickstart](./quickstart.md) +- [Usage](./usage/index.md) + - [Create a table](./usage/create-table.md) + - [Query a table](./usage/query-table.md) + - [Viewing Query Plans](./usage/query-plans.md) \ No newline at end of file diff --git a/docs/mdbook/src/images/datafusion-jupyterlab.png b/docs/mdbook/src/images/datafusion-jupyterlab.png new file mode 100644 index 000000000..c4d46884e Binary files /dev/null and b/docs/mdbook/src/images/datafusion-jupyterlab.png differ diff --git a/docs/mdbook/src/images/plan.svg b/docs/mdbook/src/images/plan.svg new file mode 100644 index 000000000..927147985 --- /dev/null +++ b/docs/mdbook/src/images/plan.svg @@ -0,0 +1,111 @@ + + + + + + +%3 + + +cluster_1 + +LogicalPlan + + +cluster_6 + +Detailed LogicalPlan + + + +2 + +Projection: my_table.a, SUM(my_table.b) + + + +3 + +Aggregate: groupBy=[[my_table.a]], aggr=[[SUM(my_table.b)]] + + + +2->3 + + + + + +4 + +Filter: my_table.a < Int64(3) + + + +3->4 + + + + + +5 + +TableScan: my_table + + + +4->5 + + + + + +7 + +Projection: my_table.a, SUM(my_table.b) +Schema: [a:Int64;N, SUM(my_table.b):Int64;N] + + + +8 + +Aggregate: groupBy=[[my_table.a]], aggr=[[SUM(my_table.b)]] +Schema: [a:Int64;N, 
SUM(my_table.b):Int64;N] + + + +7->8 + + + + + +9 + +Filter: my_table.a < Int64(3) +Schema: [a:Int64;N, b:Int64;N] + + + +8->9 + + + + + +10 + +TableScan: my_table +Schema: [a:Int64;N, b:Int64;N] + + + +9->10 + + + + + diff --git a/docs/mdbook/src/index.md b/docs/mdbook/src/index.md new file mode 100644 index 000000000..2c1d217f8 --- /dev/null +++ b/docs/mdbook/src/index.md @@ -0,0 +1,43 @@ + +# DataFusion Book + +DataFusion is a blazing fast query engine that lets you run data analyses quickly and reliably. + +DataFusion is written in Rust, but also exposes Python and SQL bindings, so you can easily query data in your language of choice. You don't need to know any Rust to be a happy and productive user of DataFusion. + +DataFusion lets you run queries faster than pandas. Let's compare query runtimes for a 5GB CSV file with 100 million rows of data. + +Take a look at a few rows of the data: + +``` ++-------+-------+--------------+-----+-----+-------+----+----+-----------+ +| id1 | id2 | id3 | id4 | id5 | id6 | v1 | v2 | v3 | ++-------+-------+--------------+-----+-----+-------+----+----+-----------+ +| id016 | id016 | id0000042202 | 15 | 24 | 5971 | 5 | 11 | 37.211254 | +| id039 | id045 | id0000029558 | 40 | 49 | 39457 | 5 | 4 | 48.951141 | +| id047 | id023 | id0000071286 | 68 | 20 | 74463 | 2 | 14 | 60.469241 | ++-------+-------+--------------+-----+-----+-------+----+----+-----------+ +``` + +Suppose you'd like to run the following query: `SELECT id1, sum(v1) AS v1 from the_table GROUP BY id1`. + +If you use pandas, then this query will take 43.6 seconds to execute. + +It only takes DataFusion 9.8 seconds to execute the same query. + +DataFusion is easy to use, powerful, and fast. Let's learn more! 
diff --git a/docs/mdbook/src/installation.md b/docs/mdbook/src/installation.md new file mode 100644 index 000000000..b29f3b66b --- /dev/null +++ b/docs/mdbook/src/installation.md @@ -0,0 +1,63 @@ + +# Installation + +DataFusion is easy to install, just like any other Python library. + +## Using uv + +If you do not yet have a virtual environment, create one: + +```bash +uv venv +``` + +You can add datafusion to your virtual environment with the usual: + +```bash +uv pip install datafusion +``` + +Or, to add to a project: + +```bash +uv add datafusion +``` + +## Using pip + +``` bash +pip install datafusion +``` + +## uv & JupyterLab setup + +This section explains how to install DataFusion in a uv environment with other libraries that allow for a nice Jupyter workflow. This setup is completely optional. These steps are only needed if you'd like to run DataFusion in a Jupyter notebook and have an interface like this: + +![DataFusion in Jupyter](https://github.com/MrPowers/datafusion-book/raw/main/src/images/datafusion-jupyterlab.png) + +Create a virtual environment with DataFusion, Jupyter, and other useful dependencies and start the desktop application. + +```bash +uv venv +uv pip install datafusion jupyterlab jupyterlab_code_formatter +uv run jupyter lab +``` + +## Examples + +See the [DataFusion Python Examples](https://github.com/apache/arrow-datafusion-python/tree/main/examples) for a variety of Python scripts that show DataFusion in action! diff --git a/docs/mdbook/src/quickstart.md b/docs/mdbook/src/quickstart.md new file mode 100644 index 000000000..bba0b36ae --- /dev/null +++ b/docs/mdbook/src/quickstart.md @@ -0,0 +1,77 @@ + +# DataFusion Quickstart + +You can easily query a DataFusion table with the Python API or with pure SQL. + +Let's create a small DataFrame and then run some queries with both APIs. + +Start by creating a DataFrame with four rows of data and two columns: `a` and `b`. 
+ +```python +from datafusion import SessionContext + +ctx = SessionContext() + +df = ctx.from_pydict({"a": [1, 2, 3, 1], "b": [4, 5, 6, 7]}, name="my_table") +``` + +Let's append a column to this DataFrame that adds columns `a` and `b` with the SQL API. + +``` +ctx.sql("select a, b, a + b as sum_a_b from my_table") + ++---+---+---------+ +| a | b | sum_a_b | ++---+---+---------+ +| 1 | 4 | 5 | +| 2 | 5 | 7 | +| 3 | 6 | 9 | +| 1 | 7 | 8 | ++---+---+---------+ +``` + +DataFusion makes it easy to run SQL queries on DataFrames. + +Now let's run the same query with the DataFusion Python API: + +```python +from datafusion import col + +df.select( + col("a"), + col("b"), + col("a") + col("b"), +) +``` + +We get the same result as before: + +``` ++---+---+-------------------------+ +| a | b | my_table.a + my_table.b | ++---+---+-------------------------+ +| 1 | 4 | 5 | +| 2 | 5 | 7 | +| 3 | 6 | 9 | +| 1 | 7 | 8 | ++---+---+-------------------------+ +``` + +DataFusion also allows you to query data with a well-designed Python interface. + +Python users have two great ways to query DataFusion tables. diff --git a/docs/mdbook/src/usage/create-table.md b/docs/mdbook/src/usage/create-table.md new file mode 100644 index 000000000..98870fac0 --- /dev/null +++ b/docs/mdbook/src/usage/create-table.md @@ -0,0 +1,59 @@ + +# DataFusion Create Table + +It's easy to create DataFusion tables from a variety of data sources. + +## Create Table from Python Dictionary + +Here's how to create a DataFusion table from a Python dictionary: + +```python +from datafusion import SessionContext + +ctx = SessionContext() + +df = ctx.from_pydict({"a": [1, 2, 3, 1], "b": [4, 5, 6, 7]}, name="my_table") +``` + +Supplying the `name` parameter is optional. You only need to name the table if you'd like to query it with the SQL API. 
+
+You can also create a DataFrame without a name that can be queried with the Python API:
+
+```python
+from datafusion import SessionContext
+
+ctx = SessionContext()
+
+df = ctx.from_pydict({"a": [1, 2, 3, 1], "b": [4, 5, 6, 7]})
+```
+
+## Create Table from CSV
+
+You can read a CSV into a DataFusion DataFrame. Here's how to read the `G1_1e8_1e2_0_0.csv` file into a table named `csv_1e8`:
+
+```python
+ctx.register_csv("csv_1e8", "G1_1e8_1e2_0_0.csv")
+```
+
+## Create Table from Parquet
+
+You can read a Parquet file into a DataFusion DataFrame. Here's how to read the `yellow_tripdata_2021-01.parquet` file into a table named `taxi`.
+
+```python
+ctx.register_parquet("taxi", "yellow_tripdata_2021-01.parquet")
+```
diff --git a/docs/mdbook/src/usage/index.md b/docs/mdbook/src/usage/index.md
new file mode 100644
index 000000000..1ef4406f7
--- /dev/null
+++ b/docs/mdbook/src/usage/index.md
@@ -0,0 +1,25 @@
+
+# Usage
+
+This section shows how to create DataFusion DataFrames from a variety of data sources like CSV files and Parquet files.
+
+You'll learn more about the SQL statements that are supported by DataFusion.
+
+You'll also learn about DataFusion's Python API for querying data.
+
+The documentation will wrap up with a variety of real-world data processing tasks that are well suited for DataFusion. The lightning-fast speed and reliable execution makes DataFusion the best technology for a variety of data processing tasks.
diff --git a/docs/mdbook/src/usage/query-plans.md b/docs/mdbook/src/usage/query-plans.md
new file mode 100644
index 000000000..a39aa9e42
--- /dev/null
+++ b/docs/mdbook/src/usage/query-plans.md
@@ -0,0 +1,170 @@
+
+
+# DataFusion Query Plans
+
+DataFusion's `DataFrame` is a wrapper around a query plan. In this chapter we will learn how to view
+logical and physical query plans for DataFrames.
+
+## Sample Data
+
+Let's go ahead and create a simple DataFrame. You can do this in the Python shell or in a notebook.
+
+```python
+from datafusion import SessionContext
+
+ctx = SessionContext()
+
+df = ctx.from_pydict({"a": [1, 2, 3, 1], "b": [4, 5, 6, 7]}, name="my_table")
+```
+
+## Logical Plan
+
+Next, let's look at the logical plan for this dataframe.
+
+```python
+>>> df.logical_plan()
+TableScan: my_table
+```
+
+The logical plan here consists of a single `TableScan` operator. Let's make a more interesting plan by creating a new
+`DataFrame` representing an aggregate query with a filter.
+
+```python
+>>> df = ctx.sql("SELECT a, sum(b) FROM my_table WHERE a < 3 GROUP BY a")
+```
+
+When we view the plan for this `DataFrame` we can see that there are now four operators in the plan, each
+representing a logical transformation of the data. We start with a `TableScan` to read the data, followed by
+a `Filter` to filter out rows that do not match the filter expression, then an `Aggregate` is performed. Finally,
+a `Projection` is applied to ensure that the order of the final columns matches the `SELECT` part of the SQL query.
+
+```python
+>>> df.logical_plan()
+Projection: my_table.a, SUM(my_table.b)
+  Aggregate: groupBy=[[my_table.a]], aggr=[[SUM(my_table.b)]]
+    Filter: my_table.a < Int64(3)
+      TableScan: my_table
+```
+
+## Optimized Logical Plan
+
+DataFusion has a powerful query optimizer which will rewrite query plans to make them more efficient before they are
+executed. We can view the output of the optimizer by viewing the optimized logical plan.
+
+```python
+>>> df.optimized_logical_plan()
+Aggregate: groupBy=[[my_table.a]], aggr=[[SUM(my_table.b)]]
+  Filter: my_table.a < Int64(3)
+    TableScan: my_table projection=[a, b]
+```
+
+We can see that there are two key differences compared to the unoptimized logical plan:
+
+- The `Projection` has been removed because it was redundant in this case (the output of the `Aggregate` plan
+  already had the columns in the correct order).
+- The `TableScan` now has a projection pushed down so that it only reads the columns required to be able to execute
+  the query. In this case the table only has two columns and we referenced them both in the query, but this optimization
+  can be very effective in real-world queries against large tables.
+
+## Physical Plan
+
+Logical plans provide a representation of "what" the query should do. Physical plans explain "how" the query
+should be executed.
+
+We can view the physical plan (also known as an execution plan) using the `execution_plan` method.
+
+```python
+>>> df.execution_plan()
+AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[SUM(my_table.b)]
+  CoalesceBatchesExec: target_batch_size=8192
+    RepartitionExec: partitioning=Hash([Column { name: "a", index: 0 }], 48), input_partitions=48
+      AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[SUM(my_table.b)]
+        CoalesceBatchesExec: target_batch_size=8192
+          FilterExec: a@0 < 3
+            RepartitionExec: partitioning=RoundRobinBatch(48), input_partitions=1
+              MemoryExec: partitions=1, partition_sizes=[1]
+```
+
+The `TableScan` has now been replaced by a more specific `MemoryExec` for scanning the in-memory data. If we were
+querying a CSV file on disk then we would expect to see a `CsvExec` instead.
+
+This plan has additional operators that were not in the logical plan:
+
+- `RepartitionExec` has been added so that the data can be split into partitions and processed in parallel using
+  multiple cores.
+- `CoalesceBatchesExec` will combine small batches into larger batches to ensure that processing remains efficient.
+
+The `Aggregate` operator now appears twice. This is because aggregates are performed in a two step process. Data is
+aggregated within each partition in parallel and then those results (which could contain duplicate grouping keys) are
+combined and the aggregate operation is applied again.
+
+## Creating Query Plan Diagrams
+
+DataFusion supports generating query plan diagrams in [DOT format](https://graphviz.org/doc/info/lang.html).
+
+DOT is a language for describing graphs and there are open source tools such as GraphViz that can render diagrams
+from DOT files.
+
+We can use the following code to generate a DOT file for a logical query plan.
+
+```python
+>>> diagram = df.logical_plan().display_graphviz()
+>>> with open('plan.dot', 'w') as f:
+>>>     f.write(diagram)
+```
+
+If we view the file, we will see the following content.
+
+```
+// Begin DataFusion GraphViz Plan (see https://graphviz.org)
+digraph {
+  subgraph cluster_1
+  {
+    graph[label="LogicalPlan"]
+    2[shape=box label="Projection: my_table.a, SUM(my_table.b)"]
+    3[shape=box label="Aggregate: groupBy=[[my_table.a]], aggr=[[SUM(my_table.b)]]"]
+    2 -> 3 [arrowhead=none, arrowtail=normal, dir=back]
+    4[shape=box label="Filter: my_table.a < Int64(3)"]
+    3 -> 4 [arrowhead=none, arrowtail=normal, dir=back]
+    5[shape=box label="TableScan: my_table"]
+    4 -> 5 [arrowhead=none, arrowtail=normal, dir=back]
+  }
+  subgraph cluster_6
+  {
+    graph[label="Detailed LogicalPlan"]
+    7[shape=box label="Projection: my_table.a, SUM(my_table.b)\nSchema: [a:Int64;N, SUM(my_table.b):Int64;N]"]
+    8[shape=box label="Aggregate: groupBy=[[my_table.a]], aggr=[[SUM(my_table.b)]]\nSchema: [a:Int64;N, SUM(my_table.b):Int64;N]"]
+    7 -> 8 [arrowhead=none, arrowtail=normal, dir=back]
+    9[shape=box label="Filter: my_table.a < Int64(3)\nSchema: [a:Int64;N, b:Int64;N]"]
+    8 -> 9 [arrowhead=none, arrowtail=normal, dir=back]
+    10[shape=box label="TableScan: my_table\nSchema: [a:Int64;N, b:Int64;N]"]
+    9 -> 10 [arrowhead=none, arrowtail=normal, dir=back]
+  }
+}
+// End DataFusion GraphViz Plan
+```
+
+We can use GraphViz from the command-line to convert this DOT file into an image.
+ +```bash +dot -Tsvg plan.dot > plan.svg +``` + +This generates the following diagram: + +![Query Plan Diagram](../images/plan.svg) diff --git a/docs/mdbook/src/usage/query-table.md b/docs/mdbook/src/usage/query-table.md new file mode 100644 index 000000000..5e4e38001 --- /dev/null +++ b/docs/mdbook/src/usage/query-table.md @@ -0,0 +1,125 @@ + +# DataFusion Query Table + +DataFusion tables can be queried with SQL or with the Python API. + +Let's create a small table and show the different types of queries that can be run. + +```python +df = ctx.from_pydict( + { + "first_name": ["li", "wang", "ron", "amanda"], + "age": [25, 75, 68, 18], + "country": ["china", "china", "us", "us"], + }, + name="some_people", +) +``` + +Here's the data in the table: + +``` ++------------+-----+---------+ +| first_name | age | country | ++------------+-----+---------+ +| li | 25 | china | +| wang | 75 | china | +| ron | 68 | us | +| amanda | 18 | us | ++------------+-----+---------+ +``` + +## DataFusion Filter DataFrame + +Here's how to find all individuals that are older than 65 years old in the data with SQL: + +``` +ctx.sql("select * from some_people where age > 65") + ++------------+-----+---------+ +| first_name | age | country | ++------------+-----+---------+ +| wang | 75 | china | +| ron | 68 | us | ++------------+-----+---------+ +``` + +Here's how to run the same query with Python: + +```python +df.filter(col("age") > lit(65)) +``` + +``` ++------------+-----+---------+ +| first_name | age | country | ++------------+-----+---------+ +| wang | 75 | china | +| ron | 68 | us | ++------------+-----+---------+ +``` + +## DataFusion Select Columns from DataFrame + +Here's how to select the `first_name` and `country` columns from the DataFrame with SQL: + +``` +ctx.sql("select first_name, country from some_people") + + ++------------+---------+ +| first_name | country | ++------------+---------+ +| li | china | +| wang | china | +| ron | us | +| amanda | us | 
++------------+---------+ +``` + +Here's how to run the same query with Python: + +```python +df.select(col("first_name"), col("country")) +``` + +``` ++------------+---------+ +| first_name | country | ++------------+---------+ +| li | china | +| wang | china | +| ron | us | +| amanda | us | ++------------+---------+ +``` + +## DataFusion Aggregation Query + +Here's how to run a group by aggregation query: + +``` +ctx.sql("select country, count(*) as num_people from some_people group by country") + ++---------+------------+ +| country | num_people | ++---------+------------+ +| china | 2 | +| us | 2 | ++---------+------------+ +``` diff --git a/docs/source/_static/images/2x_bgwhite_original.png b/docs/source/_static/images/2x_bgwhite_original.png new file mode 100644 index 000000000..abb5fca6e Binary files /dev/null and b/docs/source/_static/images/2x_bgwhite_original.png differ diff --git a/docs/source/_static/images/DataFusion-Logo-Background-White.png b/docs/source/_static/images/DataFusion-Logo-Background-White.png deleted file mode 100644 index 023c2373f..000000000 Binary files a/docs/source/_static/images/DataFusion-Logo-Background-White.png and /dev/null differ diff --git a/docs/source/_static/images/DataFusion-Logo-Background-White.svg b/docs/source/_static/images/DataFusion-Logo-Background-White.svg deleted file mode 100644 index b3bb47c5e..000000000 --- a/docs/source/_static/images/DataFusion-Logo-Background-White.svg +++ /dev/null @@ -1 +0,0 @@ -DataFUSION-Logo-Dark \ No newline at end of file diff --git a/docs/source/_static/images/DataFusion-Logo-Dark.png b/docs/source/_static/images/DataFusion-Logo-Dark.png deleted file mode 100644 index cc60f12a0..000000000 Binary files a/docs/source/_static/images/DataFusion-Logo-Dark.png and /dev/null differ diff --git a/docs/source/_static/images/DataFusion-Logo-Dark.svg b/docs/source/_static/images/DataFusion-Logo-Dark.svg deleted file mode 100644 index e16f24443..000000000 --- 
a/docs/source/_static/images/DataFusion-Logo-Dark.svg +++ /dev/null @@ -1 +0,0 @@ -DataFUSION-Logo-Dark \ No newline at end of file diff --git a/docs/source/_static/images/DataFusion-Logo-Light.png b/docs/source/_static/images/DataFusion-Logo-Light.png deleted file mode 100644 index 8992213b0..000000000 Binary files a/docs/source/_static/images/DataFusion-Logo-Light.png and /dev/null differ diff --git a/docs/source/_static/images/DataFusion-Logo-Light.svg b/docs/source/_static/images/DataFusion-Logo-Light.svg deleted file mode 100644 index b3bef2193..000000000 --- a/docs/source/_static/images/DataFusion-Logo-Light.svg +++ /dev/null @@ -1 +0,0 @@ -DataFUSION-Logo-Light \ No newline at end of file diff --git a/docs/source/_static/images/original.png b/docs/source/_static/images/original.png new file mode 100644 index 000000000..687f94676 Binary files /dev/null and b/docs/source/_static/images/original.png differ diff --git a/docs/source/_static/images/original.svg b/docs/source/_static/images/original.svg new file mode 100644 index 000000000..6ba0ece99 --- /dev/null +++ b/docs/source/_static/images/original.svg @@ -0,0 +1,31 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/docs/source/_static/images/original2x.png b/docs/source/_static/images/original2x.png new file mode 100644 index 000000000..a7402109b Binary files /dev/null and b/docs/source/_static/images/original2x.png differ diff --git a/docs/source/_static/theme_overrides.css b/docs/source/_static/theme_overrides.css index 1e972cc6f..aaa40fba2 100644 --- a/docs/source/_static/theme_overrides.css +++ b/docs/source/_static/theme_overrides.css @@ -56,7 +56,7 @@ a.navbar-brand img { /* This is the bootstrap CSS style for "table-striped". 
Since the theme does -not yet provide an easy way to configure this globaly, it easier to simply +not yet provide an easy way to configure this globally, it easier to simply include this snippet here than updating each table in all rst files to add ":class: table-striped" */ diff --git a/docs/source/_templates/docs-sidebar.html b/docs/source/_templates/docs-sidebar.html index bc2bf0092..44deeed25 100644 --- a/docs/source/_templates/docs-sidebar.html +++ b/docs/source/_templates/docs-sidebar.html @@ -1,6 +1,6 @@ - +