diff --git a/.github/workflows/dbr-lts-install.yml b/.github/workflows/dbr-lts-install.yml new file mode 100644 index 000000000..ad8e0ad6f --- /dev/null +++ b/.github/workflows/dbr-lts-install.yml @@ -0,0 +1,147 @@ +name: DBR LTS Install + +# Installs the CI-built connector wheel INSIDE real DBR LTS clusters and runs a +# SELECT 1 smoke test. This reproduces the customer install path that broke in +# ES-1960554 (thrift 0.23.0's sdist failing to build under the OLD setuptools +# shipped by DBR LTS) -- something the poetry-install-based unit CI can't see, +# because it never does a fresh `pip install` of the built artifact on an LTS +# toolchain. See scripts/dbr_lts_install_check.py + scripts/dbr_lts_smoke_notebook.py. +# +# Trigger: pull_request, but the real matrix runs ONLY when dependency-affecting +# files change (pyproject.toml / poetry.lock / this workflow / the two scripts). +# Dependency changes are the only surface that can introduce this class of +# failure, so unrelated PRs skip the (cluster-backed, slow) matrix entirely. +# This is an informational check, not a required gate. +# +# External setup this workflow depends on (all ALREADY present in azure-prod): +# - DATABRICKS_HOST / DATABRICKS_TOKEN (PECO service-principal PAT) and +# TEST_PECO_WAREHOUSE_HTTP_PATH -- the three secrets referenced below. +# - A writable UC Volume at CI_VOLUME (managed volume). The notebook-import +# dir is derived from the token's own identity (no hardcoded principal). +# +# The supported-LTS matrix tracks the Databricks Runtime support lifecycle: +# https://learn.microsoft.com/azure/databricks/release-notes/runtime/ +# 10.4 / 11.3 / 12.2 LTS are end-of-support and intentionally excluded. Revisit +# when an LTS EOLs (13.3 -> Aug 22, 2026) or a new LTS GAs. + +on: + pull_request: + +permissions: + contents: read + id-token: write + +concurrency: + group: dbr-lts-${{ github.ref }} + cancel-in-progress: true + +env: + CI_VOLUME: /Volumes/peco/default/ci_wheels + +jobs: + # ─────────────────────────────────────────────────────────────── + # Detect whether dependency-affecting files changed on this PR. + # ─────────────────────────────────────────────────────────────── + detect-changes: + runs-on: + group: databricks-protected-runner-group + labels: linux-ubuntu-latest + outputs: + run_tests: ${{ steps.changed.outputs.run_tests }} + steps: + - name: Check out repo + uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 + with: + fetch-depth: 0 + - name: Detect dependency changes + id: changed + env: + BASE_SHA: ${{ github.event.pull_request.base.sha }} + HEAD_SHA: ${{ github.event.pull_request.head.sha }} + run: | + CHANGED=$(git diff --name-only "$BASE_SHA" "$HEAD_SHA") + echo "Changed files:"; echo "$CHANGED" + if echo "$CHANGED" | grep -qE "^(pyproject\.toml|poetry\.lock|\.github/workflows/dbr-lts-install\.yml|scripts/dbr_lts_install_check\.py|scripts/dbr_lts_smoke_notebook\.py)$"; then + echo "run_tests=true" >> "$GITHUB_OUTPUT" + echo "Dependency-affecting files changed — will run the LTS matrix" + else + echo "run_tests=false" >> "$GITHUB_OUTPUT" + echo "No dependency changes — skipping the LTS matrix" + fi + + # ─────────────────────────────────────────────────────────────── + # Build the wheel once per matrix leg and install it on the target DBR + # LTS runtime with the given extras. + # + # Matrix = supported LTS × install-target {base, pyarrow, kernel}. The + # [kernel] extra is a no-op below Python 3.10; all current LTS are ≥3.10 + # so every leg is meaningful today. If a future LTS ships Python <3.10, + # drop its kernel leg from `include`. + # ─────────────────────────────────────────────────────────────── + lts-install: + needs: detect-changes + if: needs.detect-changes.outputs.run_tests == 'true' + runs-on: + group: databricks-protected-runner-group + labels: linux-ubuntu-latest + environment: azure-prod + permissions: + contents: read + id-token: write + strategy: + fail-fast: false + matrix: + include: + # 13.3 LTS (Python 3.10) — EOL Aug 22, 2026 + - { spark: "13.3.x-scala2.12", extras: "" } + - { spark: "13.3.x-scala2.12", extras: "pyarrow" } + - { spark: "13.3.x-scala2.12", extras: "kernel" } + # 14.3 LTS (Python 3.10) — broke in ES-1960554 + - { spark: "14.3.x-scala2.12", extras: "" } + - { spark: "14.3.x-scala2.12", extras: "pyarrow" } + - { spark: "14.3.x-scala2.12", extras: "kernel" } + # 15.4 LTS (Python 3.11) — broke in ES-1960554 + - { spark: "15.4.x-scala2.12", extras: "" } + - { spark: "15.4.x-scala2.12", extras: "pyarrow" } + - { spark: "15.4.x-scala2.12", extras: "kernel" } + # 16.4 LTS (Python 3.12) + - { spark: "16.4.x-scala2.12", extras: "" } + - { spark: "16.4.x-scala2.12", extras: "pyarrow" } + - { spark: "16.4.x-scala2.12", extras: "kernel" } + # 17.3 LTS (Python 3.12) + - { spark: "17.3.x-scala2.13", extras: "" } + - { spark: "17.3.x-scala2.13", extras: "pyarrow" } + - { spark: "17.3.x-scala2.13", extras: "kernel" } + env: + DATABRICKS_HOST: ${{ secrets.DATABRICKS_HOST }} + # Driver -> workspace API (jobs/scim/files) uses OAuth M2M as the PECO + # service principal. A plain PAT (DATABRICKS_TOKEN) is warehouse-scoped + # and is rejected by the workspace REST API ("Invalid access token"). + DATABRICKS_CLIENT_ID: ${{ secrets.TEST_PECO_SP_ID }} + DATABRICKS_CLIENT_SECRET: ${{ secrets.TEST_PECO_SP_OAUTH_SECRET }} + DATABRICKS_HTTP_PATH: ${{ secrets.TEST_PECO_WAREHOUSE_HTTP_PATH }} + steps: + - name: Check out repository + uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 + - name: Set up Python + uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5 + with: + python-version: "3.11" + - name: Configure JFrog pip index (OIDC) + uses: ./.github/actions/setup-jfrog + - name: Install build + driver tooling + run: pip install "poetry==2.2.1" "databricks-sdk>=0.20,<1" + - name: Build the connector wheel + run: poetry build -f wheel + - name: Install wheel on ${{ matrix.spark }} (extras=${{ matrix.extras || 'base' }}) + run: | + WHEEL=$(ls dist/databricks_sql_connector-*.whl | head -1) + echo "Built wheel: $WHEEL" + python scripts/dbr_lts_install_check.py \ + --wheel "$WHEEL" \ + --smoke-notebook scripts/dbr_lts_smoke_notebook.py \ + --spark-version "${{ matrix.spark }}" \ + --extras "${{ matrix.extras }}" \ + --volume "${CI_VOLUME}" \ + --run-id "${GITHUB_RUN_ID}-${{ matrix.spark }}-${{ matrix.extras || 'base' }}" \ + --http-path "${DATABRICKS_HTTP_PATH}" diff --git a/pyproject.toml b/pyproject.toml index 8d3d89d2e..b56055ad5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,6 +10,15 @@ include = ["CHANGELOG.md"] [tool.poetry.dependencies] python = "^3.8.0" +# Keep the cap at <0.23.0. thrift ships sdist-only, and thrift 0.23.0's +# setup.py calls sys.exit(0) on the build-success path, which kills the PEP +# 517 build backend before pip writes output.json. On the OLD setuptools +# shipped by DBR LTS (14.3 / 15.4) this is a hard `pip install` failure -- +# the SEV0 ES-1960554 incident (4.2.7 widened this to <0.24.0 and was yanked; +# see PR #840). Do NOT re-widen until a fixed thrift (0.23.1 / 0.24.0, per +# THRIFT-6067 / apache/thrift#3584) ships. The `DBR LTS Install` CI check +# (.github/workflows/dbr-lts-install.yml) installs the built wheel on real DBR +# LTS clusters and catches this regression when dependency pins change. thrift = "~=0.22.0" pandas = [ { version = ">=1.2.5,<4.0.0", python = ">=3.8,<3.13" }, diff --git a/scripts/dbr_lts_install_check.py b/scripts/dbr_lts_install_check.py new file mode 100644 index 000000000..373445da6 --- /dev/null +++ b/scripts/dbr_lts_install_check.py @@ -0,0 +1,279 @@ +"""Install the CI-built connector wheel inside a real DBR LTS cluster and smoke-test it. + +This is the GitHub-side driver for the "DBR LTS Install" check. It reproduces the +exact customer install path that broke in ES-1960554 (thrift 0.23.0's sdist failing +to build under the old setuptools shipped by DBR LTS runtimes) -- something the +poetry-install-based unit CI cannot see, because it never does a fresh `pip install` +of the built artifact on an LTS toolchain. + +Flow (per DBR LTS version x install target): + 1. Upload the wheel to a UC Volume on the target workspace. + 2. Import the smoke notebook into the workspace. + 3. Submit a one-off Job on an ephemeral single-node SINGLE_USER job cluster pinned + to the given `spark_version`; the notebook pip-installs the wheel (+ extras) and + runs a SELECT 1 smoke query. + 4. Poll to completion, surface the run's error, and exit non-zero on any non-SUCCESS. + +Auth: env DATABRICKS_HOST + DATABRICKS_TOKEN (the azure-prod CI secrets). + +Several non-obvious cluster constraints are baked in below and MUST NOT be removed +without re-validating -- see the inline comments and the block at submit(): + - notebook_task (not spark_python_task from a Volume), + - the cluster must be SINGLE_USER access mode to read /Volumes, + - the wheel is copied off /Volumes with dbutils.fs.cp inside the notebook, + - the notebook calls dbutils.library.restartPython() after install. + +Example: + python scripts/dbr_lts_install_check.py \ + --wheel dist/databricks_sql_connector-4.3.0-py3-none-any.whl \ + --smoke-notebook scripts/dbr_lts_smoke_notebook.py \ + --workspace-dir /Users/ci-sp/dbr_lts_install \ + --spark-version 15.4.x-scala2.12 \ + --extras pyarrow \ + --volume /Volumes/peco/default/ci_wheels \ + --run-id "${GITHUB_RUN_ID}-15.4-pyarrow" \ + --http-path "${DATABRICKS_HTTP_PATH}" +""" +import argparse +import base64 +import os +import sys +import time + +from databricks.sdk import WorkspaceClient +from databricks.sdk.service import compute, jobs, workspace + +# Terminal life-cycle states -- once the run reaches one of these, polling stops. +_TERMINAL = ( + jobs.RunLifeCycleState.TERMINATED, + jobs.RunLifeCycleState.SKIPPED, + jobs.RunLifeCycleState.INTERNAL_ERROR, +) + + +def upload_wheel(w: WorkspaceClient, local_path: str, remote_path: str) -> None: + print(f"uploading {local_path} -> {remote_path}", flush=True) + with open(local_path, "rb") as f: + w.files.upload(remote_path, f, overwrite=True) + + +def import_notebook(w: WorkspaceClient, local_path: str, remote_path: str) -> None: + """Import a source .py notebook into the workspace. + + The cluster identity cannot reliably read a spark_python_task.python_file off a + UC Volume, but a workspace notebook_task is read via the control plane and works + -- so the smoke SOURCE lives in the workspace (the wheel still comes from the + Volume via dbutils.fs.cp inside the notebook). + """ + print(f"importing notebook {local_path} -> {remote_path}", flush=True) + with open(local_path, "rb") as f: + content = base64.b64encode(f.read()).decode() + w.workspace.mkdirs(remote_path.rsplit("/", 1)[0]) + w.workspace.import_( + path=remote_path, + format=workspace.ImportFormat.SOURCE, + language=workspace.Language.PYTHON, + content=content, + overwrite=True, + ) + + +def cleanup(w: WorkspaceClient, wheel_remote: str, nb_remote: str) -> None: + """Best-effort removal of the per-run artifacts this driver created. + + Each run writes a unique wheel dir on the Volume and a unique notebook in + the workspace; delete both so they don't accumulate. Never fails the run -- + cleanup errors are logged, not raised. The ephemeral job cluster + auto-terminates on its own. + """ + wheel_dir = wheel_remote.rsplit("/", 1)[0] + # delete_directory requires an empty dir, so remove the wheel file first. + for what, fn in ( + ("wheel file", lambda: w.files.delete(wheel_remote)), + ("wheel dir", lambda: w.files.delete_directory(wheel_dir)), + ("notebook", lambda: w.workspace.delete(nb_remote)), + ): + try: + fn() + print(f"cleaned up {what}", flush=True) + except Exception as e: # noqa: BLE001 + print(f"(cleanup of {what} failed, ignoring: {e})", flush=True) + + +def main() -> None: + ap = argparse.ArgumentParser(description=__doc__) + ap.add_argument("--wheel", required=True, help="path to the built connector wheel") + ap.add_argument( + "--smoke-notebook", required=True, help="path to the smoke notebook .py" + ) + ap.add_argument( + "--workspace-dir", + default=None, + help="workspace dir to import the notebook into " + "(default: /Users//dbr_lts_install)", + ) + ap.add_argument( + "--spark-version", required=True, help="DBR runtime key, e.g. 15.4.x-scala2.12" + ) + ap.add_argument( + "--extras", + default="", + help="extra to install: '' (base), 'pyarrow', or 'kernel'", + ) + ap.add_argument( + "--node-type", default="Standard_DS3_v2", help="cluster node type id" + ) + ap.add_argument( + "--volume", required=True, help="UC Volume dir, /Volumes///" + ) + ap.add_argument( + "--run-id", + required=True, + help="unique per-run id for the volume/notebook paths", + ) + ap.add_argument( + "--http-path", required=True, help="SQL warehouse http_path for the smoke query" + ) + ap.add_argument( + "--timeout", type=int, default=1800, help="seconds to wait for the run" + ) + args = ap.parse_args() + + host = os.environ["DATABRICKS_HOST"] + if not host.startswith("http"): + host = "https://" + host + + # Auth is OAuth M2M (service principal) throughout: the driver -> workspace + # API (jobs/scim/files), AND the notebook's connector -> SQL warehouse smoke + # query (via credentials_provider, same SP). A plain PAT is warehouse-scoped + # and rejected by the workspace REST API; and a PAT was also rejected by the + # warehouse from inside the cluster -- one M2M identity avoids both. + client_id = os.environ.get("DATABRICKS_CLIENT_ID") + client_secret = os.environ.get("DATABRICKS_CLIENT_SECRET") + if not (client_id and client_secret): + sys.exit( + "DATABRICKS_CLIENT_ID and DATABRICKS_CLIENT_SECRET are required " + "(OAuth M2M service principal)." + ) + print("driver auth: oauth-m2m (service principal)", flush=True) + w = WorkspaceClient( + host=host, + client_id=client_id, + client_secret=client_secret, + auth_type="oauth-m2m", + ) + + # The ephemeral cluster runs SINGLE_USER access mode as this identity so it is + # UC-enabled and can read the wheel off /Volumes. A no-isolation single-node + # cluster gets "No Unity API token found in Unity Scope" touching /Volumes. + # The identity also anchors the default notebook-import dir, so nothing about + # the run-as principal is hardcoded (works for whatever SP owns the token). + me = w.current_user.me().user_name + print(f"run-as identity: {me}", flush=True) + workspace_dir = args.workspace_dir or f"/Users/{me}/dbr_lts_install" + + wheel_remote = f"{args.volume}/{args.run_id}/{os.path.basename(args.wheel)}" + nb_remote = f"{workspace_dir}/lts_smoke_{args.run_id}" + try: + _run(w, args, host, client_id, client_secret, me, wheel_remote, nb_remote) + finally: + # Remove the per-run artifacts on every exit path (success, failure, + # timeout, or sys.exit -- SystemExit propagates through finally). + cleanup(w, wheel_remote, nb_remote) + + +def _run(w, args, host, client_id, client_secret, me, wheel_remote, nb_remote): + upload_wheel(w, args.wheel, wheel_remote) + import_notebook(w, args.smoke_notebook, nb_remote) + + label = f"{args.spark_version} extras={args.extras or 'base'}" + print(f"\nsubmitting DBR LTS install job: {label}", flush=True) + run = w.jobs.submit( + run_name=f"ci-dbr-lts-install-{args.run_id}", + tasks=[ + jobs.SubmitTask( + task_key="lts_install_smoke", + new_cluster=compute.ClusterSpec( + spark_version=args.spark_version, + node_type_id=args.node_type, + num_workers=0, # single node + # SINGLE_USER access mode -> UC-enabled -> can read /Volumes. + data_security_mode=compute.DataSecurityMode.SINGLE_USER, + single_user_name=me, + spark_conf={ + "spark.databricks.cluster.profile": "singleNode", + "spark.master": "local[*]", + }, + custom_tags={"ResourceClass": "SingleNode"}, + ), + notebook_task=jobs.NotebookTask( + notebook_path=nb_remote, + base_parameters={ + "wheel": wheel_remote, + "extras": args.extras, + "server_hostname": host.replace("https://", ""), + "http_path": args.http_path, + "client_id": client_id, + "client_secret": client_secret, + }, + ), + ) + ], + ) + run_id = run.run_id + print(f"submitted run_id={run_id} {host}/#job/run/{run_id}", flush=True) + + deadline = time.time() + args.timeout + state = None + consecutive_errors = 0 + while time.time() < deadline: + # Tolerate transient polling errors (token refresh, network blips): a + # single failed get_run must NOT sink the whole check with a spurious + # failure. Only give up after several consecutive errors. + try: + state = w.jobs.get_run(run_id).state + consecutive_errors = 0 + except Exception as e: # noqa: BLE001 + consecutive_errors += 1 + print( + f" [{int(time.time())}] poll error ({consecutive_errors}/5): {e}", + flush=True, + ) + if consecutive_errors >= 5: + print("giving up after 5 consecutive poll errors", flush=True) + sys.exit(2) + time.sleep(20) + continue + print( + f" [{int(time.time())}] life_cycle={state.life_cycle_state} " + f"result={state.result_state}", + flush=True, + ) + if state.life_cycle_state in _TERMINAL: + break + time.sleep(20) + else: + print(f"TIMEOUT after {args.timeout}s waiting for run {run_id}", flush=True) + sys.exit(2) + + # Surface the task run's error (cell stdout is not captured here; the notebook + # folds pip's stderr tail into the raised exception so it lands in error). + try: + task_run_id = w.jobs.get_run(run_id).tasks[0].run_id + out = w.jobs.get_run_output(task_run_id) + if out.error: + print("\n===== run error =====\n" + out.error, flush=True) + except Exception as e: # noqa: BLE001 + print(f"(could not fetch run output: {e})", flush=True) + + result = state.result_state if state else None + print( + f"\nFINAL result_state={result} on {args.spark_version} (extras={args.extras or 'base'})", + flush=True, + ) + if result != jobs.RunResultState.SUCCESS: + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/scripts/dbr_lts_smoke_notebook.py b/scripts/dbr_lts_smoke_notebook.py new file mode 100644 index 000000000..909ecb3d3 --- /dev/null +++ b/scripts/dbr_lts_smoke_notebook.py @@ -0,0 +1,140 @@ +# Databricks notebook source +# DBR LTS install smoke check. Runs INSIDE a job cluster on a DBR LTS runtime, +# driven by scripts/dbr_lts_install_check.py. It reproduces the real customer +# install path that broke in ES-1960554: a fresh `pip install` of the built +# connector wheel, which resolves + builds transitive deps (notably thrift's +# sdist) under the cluster's OWN, old, setuptools -- then imports the connector +# and runs SELECT 1 against the PECO SQL warehouse. +# +# Params arrive as job notebook_task base_parameters -> dbutils widgets. + +# COMMAND ---------- +import subprocess +import sys + +dbutils.widgets.text("wheel", "") +dbutils.widgets.text("extras", "") +dbutils.widgets.text("server_hostname", "") +dbutils.widgets.text("http_path", "") +dbutils.widgets.text("client_id", "") +dbutils.widgets.text("client_secret", "") + +wheel = dbutils.widgets.get("wheel") +extras = dbutils.widgets.get("extras") + +print("=== runtime toolchain ===") +print("python:", sys.version) +for mod in ("setuptools", "pip", "wheel"): + try: + m = __import__(mod) + print(f"{mod}:", getattr(m, "__version__", "?")) + except Exception as e: # noqa: BLE001 + print(f"{mod}: {e}") + +# COMMAND ---------- +# Copy the wheel off the UC Volume to local disk with dbutils.fs (which HAS +# Volume access). Raw open()/shutil on the /Volumes FUSE mount raises +# "PermissionError: [Errno 1] Operation not permitted" from the driver Python, +# and the cluster must be SINGLE_USER access mode or dbutils.fs itself fails +# with "No Unity API token found in Unity Scope". We then pip-install the LOCAL +# copy, so the install itself -- the resolve + sdist build of transitive deps +# under the cluster's OWN toolchain -- is exactly the ES-1960554 path. +local_wheel = "/tmp/" + wheel.rsplit("/", 1)[-1] +dbutils.fs.cp("dbfs:" + wheel, "file:" + local_wheel) +print("copied wheel ->", local_wheel) + +target = local_wheel + (f"[{extras}]" if extras else "") +print(f"=== pip install {target} ===") +res = subprocess.run( + [sys.executable, "-m", "pip", "install", "--disable-pip-version-check", target], + capture_output=True, + text=True, +) +print(res.stdout) +print(res.stderr) +if res.returncode != 0: + # Fold pip's tail into the exception so the reason lands in the job run's + # error (cell stdout is NOT captured by jobs.get_run_output). + tail = (res.stdout + "\n" + res.stderr).strip().splitlines()[-25:] + raise RuntimeError( + f"pip install failed (rc={res.returncode}) for {target}\n" + "\n".join(tail) + ) + +# COMMAND ---------- +# Upgrade databricks-sdk for the SMOKE HARNESS only (separate from the wheel +# under test). Older DBR LTS (13.3 / 14.3) ship an SDK too old to honor +# auth_type="oauth-m2m" -- oauth_service_principal falls through to +# DefaultCredentials and raises "cannot configure default credentials". A +# current SDK fixes the M2M credential provider used by the smoke query below. +sdk_res = subprocess.run( + [ + sys.executable, + "-m", + "pip", + "install", + "--disable-pip-version-check", + "-U", + "databricks-sdk", + ], + capture_output=True, + text=True, +) +print(sdk_res.stdout[-2000:]) +if sdk_res.returncode != 0: + tail = (sdk_res.stdout + "\n" + sdk_res.stderr).strip().splitlines()[-15:] + raise RuntimeError("databricks-sdk upgrade failed\n" + "\n".join(tail)) + +# COMMAND ---------- +# DBR pre-seeds a `databricks` namespace package at +# /databricks/spark/python/databricks ahead of the pip-installed connector on +# sys.path, so `from databricks import sql` raises ImportError until Python is +# restarted. This is the supported way to make a notebook-scoped install take +# effect. NOTE: all Python state is wiped -- widgets are re-read below. +dbutils.library.restartPython() + +# COMMAND ---------- +# Fresh interpreter: re-read widgets, then import + smoke query. +server_hostname = dbutils.widgets.get("server_hostname") +http_path = dbutils.widgets.get("http_path") +client_id = dbutils.widgets.get("client_id") +client_secret = dbutils.widgets.get("client_secret") + +from importlib.metadata import version + +print("databricks-sql-connector:", version("databricks-sql-connector")) +print("thrift:", version("thrift")) + +print("=== smoke query (SELECT 1) ===") +from databricks import sql +from databricks.sdk.core import Config, oauth_service_principal + + +# OAuth M2M (service principal) -- same identity the driver uses. Avoids +# depending on a warehouse-scoped PAT (which the SQL warehouse rejected as +# "Invalid access token" from inside the cluster). Mirrors examples/m2m_oauth.py. +def credential_provider(): + return oauth_service_principal( + Config( + host=f"https://{server_hostname}", + client_id=client_id, + client_secret=client_secret, + # Explicit so an ambient DATABRICKS_TOKEN on the cluster doesn't + # collide ("more than one authorization method configured"). + auth_type="oauth-m2m", + ) + ) + + +with sql.connect( + server_hostname=server_hostname, + http_path=http_path, + credentials_provider=credential_provider, +) as conn: + with conn.cursor() as cur: + cur.execute("SELECT 1") + row = cur.fetchone() + assert row is not None and row[0] == 1, f"unexpected result: {row!r}" + print("SELECT 1 ->", row[0]) + +print("SMOKE OK") +dbutils.notebook.exit("SMOKE OK") diff --git a/src/databricks/sql/utils.py b/src/databricks/sql/utils.py index e7b5337f6..e4d0ee325 100644 --- a/src/databricks/sql/utils.py +++ b/src/databricks/sql/utils.py @@ -894,7 +894,23 @@ def concat_table_chunks( result_table[j].extend(table_chunks[i].column_table[j]) return ColumnTable(result_table, table_chunks[0].column_names) else: + return _concat_arrow_tables(table_chunks) + + +def _concat_arrow_tables(table_chunks: List["pyarrow.Table"]) -> "pyarrow.Table": + """Concatenate Arrow tables, tolerant of the installed pyarrow version. + + ``promote_options`` was added in pyarrow 14.0.0, replacing the older + ``promote`` boolean. The connector's declared floor is pyarrow>=14, but a + base install (no ``[pyarrow]`` extra) can run against a runtime's older + bundled pyarrow -- e.g. DBR 13.3 (12.x) / 14.3 -- where ``promote_options`` + raises ``TypeError: unexpected keyword argument``. Fall back to the legacy + ``promote=True`` there (equivalent to ``promote_options="default"``). + """ + try: return pyarrow.concat_tables(table_chunks, promote_options="default") + except TypeError: + return pyarrow.concat_tables(table_chunks, promote=True) def serialize_query_tags( diff --git a/tests/unit/test_util.py b/tests/unit/test_util.py index 687bdd391..94a96b54a 100644 --- a/tests/unit/test_util.py +++ b/tests/unit/test_util.py @@ -152,6 +152,28 @@ def test_concat_table_chunks_arrow_table(self): assert result_table.column("col1").to_pylist() == [1, 2, 3, 4] assert result_table.column("col2").to_pylist() == [5, 6, 7, 8] + @pytest.mark.skipif(pyarrow is None, reason="PyArrow is not installed") + def test_concat_table_chunks_arrow_old_pyarrow_fallback(self, monkeypatch): + # pyarrow < 14 has no `promote_options` kwarg; a base connector install + # can run against a runtime's old bundled pyarrow (e.g. DBR 13.3/14.3). + # concat_table_chunks must fall back to the legacy `promote=True` API + # rather than raise TypeError (regression: DBR LTS install check). + arrow_table1 = pyarrow.Table.from_pydict({"col1": [1, 2]}) + arrow_table2 = pyarrow.Table.from_pydict({"col1": [3, 4]}) + + real_concat = pyarrow.concat_tables + + def old_concat(tables, promote_options=None, **kwargs): + if promote_options is not None: + raise TypeError( + "concat_tables() got an unexpected keyword argument 'promote_options'" + ) + return real_concat(tables, **kwargs) + + monkeypatch.setattr(pyarrow, "concat_tables", old_concat) + result_table = concat_table_chunks([arrow_table1, arrow_table2]) + assert result_table.column("col1").to_pylist() == [1, 2, 3, 4] + def test_concat_table_chunks_empty(self): result_table = concat_table_chunks([]) assert result_table == []