From 09176220623ed125a760b5ea504d3ed617988f27 Mon Sep 17 00:00:00 2001 From: augray Date: Tue, 15 Oct 2024 12:37:31 -0400 Subject: [PATCH 1/3] Support custom annotations for Ray pods (#1131) Some k8s infrastructure may require custom annotations and/or labels to be applied to pods in the cluster (ex: custom autoscalers). Sematic already supports control over this for runner and standalone job pods, but doesn't for pods created via the Ray integration. This PR adds that support. The integration works by adding helm-configurable annotations and labels. The integration works by allowing you to specify annotations or labels to apply to ALL pods created by Sematic's ray integration. You can specify different configurations for pods with/without GPUs but otherwise the configurations must be universal. These configurations must be specified at the helm level, and NOT by pipeline authors. Testing ------- In addition to the unit test, manually applied a Ray cluster yaml created with these new configurations to confirm the resulting pods had the expected annotations and labels. 
--------- Co-authored-by: Josh Bauer --- helm/sematic-server/templates/configmap.yaml | 4 ++ helm/sematic-server/values.yaml | 4 ++ sematic/plugins/kuberay_wrapper/standard.py | 57 ++++++++++++++++++- .../kuberay_wrapper/tests/test_standard.py | 33 ++++++++++- 4 files changed, 93 insertions(+), 5 deletions(-) diff --git a/helm/sematic-server/templates/configmap.yaml b/helm/sematic-server/templates/configmap.yaml index cb99c2eea..01aeae5ba 100644 --- a/helm/sematic-server/templates/configmap.yaml +++ b/helm/sematic-server/templates/configmap.yaml @@ -63,6 +63,10 @@ data: {{ end }} {{ if .Values.ray.enabled }} RAY_GPU_NODE_SELECTOR: {{ toJson .Values.ray.gpu_node_selector | quote }} + RAY_GPU_ANNOTATIONS: {{ toJson .Values.ray.gpu_annotations | quote }} + RAY_GPU_LABELS: {{ toJson .Values.ray.gpu_labels | quote }} + RAY_NON_GPU_ANNOTATIONS: {{ toJson .Values.ray.non_gpu_annotations | quote }} + RAY_NON_GPU_LABELS: {{ toJson .Values.ray.non_gpu_labels | quote }} RAY_NON_GPU_NODE_SELECTOR: {{ toJson .Values.ray.non_gpu_node_selector | quote }} RAY_GPU_TOLERATIONS: {{ toJson .Values.ray.gpu_tolerations | quote }} RAY_NON_GPU_TOLERATIONS: {{ toJson .Values.ray.non_gpu_tolerations | quote }} diff --git a/helm/sematic-server/values.yaml b/helm/sematic-server/values.yaml index a421bb228..9778bbe56 100644 --- a/helm/sematic-server/values.yaml +++ b/helm/sematic-server/values.yaml @@ -118,6 +118,10 @@ ray: supports_gpus: false gpu_node_selector: {} non_gpu_node_selector: {} + gpu_annotations: {} + non_gpu_annotations: {} + gpu_labels: {} + non_gpu_labels: {} gpu_tolerations: [] non_gpu_tolerations: [] gpu_resource_request_key: null diff --git a/sematic/plugins/kuberay_wrapper/standard.py b/sematic/plugins/kuberay_wrapper/standard.py index bba578d06..ae23a8ae0 100644 --- a/sematic/plugins/kuberay_wrapper/standard.py +++ b/sematic/plugins/kuberay_wrapper/standard.py @@ -5,7 +5,11 @@ # Sematic from sematic.abstract_plugin import AbstractPluginSettingsVar -from 
sematic.config.server_settings import ServerSettingsVar, get_server_setting +from sematic.config.server_settings import ( + ServerSettingsVar, + get_json_server_setting, + get_server_setting, +) from sematic.config.settings import get_plugin_setting from sematic.plugins.abstract_kuberay_wrapper import ( AbstractKuberayWrapper, @@ -36,6 +40,22 @@ class StandardKuberaySettingsVar(AbstractPluginSettingsVar): RAY_NON_GPU_TOLERATIONS: The Kubernetes tolerations that will be used for Ray nodes that don't use GPUs. Value should be json encoded into a string. + RAY_GPU_LABELS: + The Kubernetes labels that will be used for Ray nodes that use + GPUs. Value should be a json encoded object containing the + keys and values for the labels. + RAY_NON_GPU_LABELS: + The Kubernetes labels that will be used for Ray nodes that don't + use GPUs. Value should be a json encoded object containing the + keys and values for the labels. + RAY_GPU_ANNOTATIONS: + The Kubernetes annotations that will be used for Ray nodes that use + GPUs. Value should be a json encoded object containing the + keys and values for the annotations. + RAY_NON_GPU_ANNOTATIONS: + The Kubernetes annotations that will be used for Ray nodes that don't + use GPUs. Value should be a json encoded object containing the + keys and values for the annotations. 
RAY_GPU_RESOURCE_REQUEST_KEY: The key that will be used in the Kubernetes resource requests/ limits fields to indicate how many GPUs are required for Ray @@ -59,6 +79,10 @@ class StandardKuberaySettingsVar(AbstractPluginSettingsVar): RAY_GPU_RESOURCE_REQUEST_KEY = "RAY_GPU_RESOURCE_REQUEST_KEY" RAY_SUPPORTS_GPUS = "RAY_SUPPORTS_GPUS" RAY_BUSYBOX_PULL_OVERRIDE = "RAY_BUSYBOX_PULL_OVERRIDE" + RAY_GPU_LABELS = "RAY_GPU_LABELS" + RAY_NON_GPU_LABELS = "RAY_NON_GPU_LABELS" + RAY_GPU_ANNOTATIONS = "RAY_GPU_ANNOTATIONS" + RAY_NON_GPU_ANNOTATIONS = "RAY_NON_GPU_ANNOTATIONS" class _NeedsOverride: @@ -78,6 +102,7 @@ class _NeedsOverride: "groupName": _NeedsOverride, "rayStartParams": {"block": "true"}, "template": { + "metadata": {"labels": _NeedsOverride, "annotations": _NeedsOverride}, "spec": { "containers": [ { @@ -113,7 +138,7 @@ class _NeedsOverride: "serviceAccountName": _NeedsOverride, "nodeSelector": _NeedsOverride, "volumes": [{"name": "ray-logs", "emptyDir": {}}], - } + }, }, } @@ -152,7 +177,7 @@ class _NeedsOverride: "serviceType": "ClusterIP", "rayStartParams": {"dashboard-host": "0.0.0.0", "block": "true"}, "template": { - "metadata": {"labels": {}}, + "metadata": {"labels": _NeedsOverride, "annotations": _NeedsOverride}, "spec": { "containers": [ { @@ -284,6 +309,12 @@ def _make_worker_group_spec( StandardKuberaySettingsVar.RAY_BUSYBOX_PULL_OVERRIDE, _DEFAULT_BUSYBOX_PULL, ) + group_manifest["template"]["metadata"]["labels"] = cls._get_tags( + worker_group.worker_nodes, is_label=True + ) + group_manifest["template"]["metadata"]["annotations"] = cls._get_tags( + worker_group.worker_nodes, is_label=False + ) return group_manifest @@ -325,6 +356,20 @@ def _validate_ray_version(cls, ray_version: str) -> None: "Only ray versions 2.0 or higher are supported." 
) + @classmethod + def _get_tags(cls, node_config: RayNodeConfig, is_label: bool) -> Dict[str, str]: + requires_gpu = node_config.gpu_count > 0 + settings_var = { + (False, False): StandardKuberaySettingsVar.RAY_NON_GPU_ANNOTATIONS, + (False, True): StandardKuberaySettingsVar.RAY_NON_GPU_LABELS, + (True, False): StandardKuberaySettingsVar.RAY_GPU_ANNOTATIONS, + (True, True): StandardKuberaySettingsVar.RAY_GPU_LABELS, + }[(requires_gpu, is_label)] + tags = get_json_server_setting(settings_var, {}) # type: ignore + if tags is None: + tags = {} + return tags + @classmethod def _make_head_group_spec( cls, @@ -351,6 +396,12 @@ def _make_head_group_spec( head_group_template["template"]["spec"][ "serviceAccountName" ] = _get_service_account() + head_group_template["template"]["metadata"]["labels"] = cls._get_tags( + node_config, is_label=True + ) + head_group_template["template"]["metadata"]["annotations"] = cls._get_tags( + node_config, is_label=False + ) return head_group_template diff --git a/sematic/plugins/kuberay_wrapper/tests/test_standard.py b/sematic/plugins/kuberay_wrapper/tests/test_standard.py index 51a5e597f..c29f8a753 100644 --- a/sematic/plugins/kuberay_wrapper/tests/test_standard.py +++ b/sematic/plugins/kuberay_wrapper/tests/test_standard.py @@ -107,7 +107,7 @@ "serviceType": "ClusterIP", "rayStartParams": {"dashboard-host": "0.0.0.0", "block": "true"}, "template": { - "metadata": {"labels": {}}, + "metadata": {"labels": {}, "annotations": {}}, "spec": { "containers": [ { @@ -152,6 +152,7 @@ "groupName": "worker-group-0", "rayStartParams": {"block": "true"}, "template": { + "metadata": {"labels": {}, "annotations": {}}, "spec": { "containers": [ { @@ -187,7 +188,7 @@ "serviceAccountName": "default", "nodeSelector": {}, "volumes": [{"name": "ray-logs", "emptyDir": {}}], - } + }, }, } @@ -304,6 +305,12 @@ def test_head_node_gpus(): gpu_node_selector = { "nvidia.com/gpu": "true", } + gpu_annotations = { + "annotation.com": "foo", + } + gpu_labels = { + 
"label": "bar", + } non_gpu_tolerations = [ dict( key="foo", @@ -324,6 +331,10 @@ def test_head_node_gpus(): StandardKuberaySettingsVar.RAY_GPU_NODE_SELECTOR.value: json.dumps( gpu_node_selector ), + StandardKuberaySettingsVar.RAY_GPU_ANNOTATIONS.value: json.dumps( + gpu_annotations + ), + StandardKuberaySettingsVar.RAY_GPU_LABELS.value: json.dumps(gpu_labels), StandardKuberaySettingsVar.RAY_NON_GPU_TOLERATIONS.value: json.dumps( non_gpu_tolerations ), @@ -355,6 +366,10 @@ def test_head_node_gpus(): assert manifest["spec"]["headGroupSpec"]["template"]["spec"]["containers"][0][ "resources" ]["requests"] == {"cpu": "2000m", "memory": "4096Mi"} + assert manifest["spec"]["headGroupSpec"]["template"]["metadata"] == { + "annotations": gpu_annotations, + "labels": gpu_labels, + } assert ( manifest["spec"]["workerGroupSpecs"][0]["template"]["spec"]["nodeSelector"] == non_gpu_node_selector @@ -377,6 +392,12 @@ def test_worker_node_gpus(): expected_node_selector = { "nvidia.com/gpu": "true", } + gpu_annotations = { + "annotation.com": "foo", + } + gpu_labels = { + "label": "bar", + } with environment_variables( { StandardKuberaySettingsVar.RAY_SUPPORTS_GPUS.value: "true", @@ -389,6 +410,10 @@ def test_worker_node_gpus(): StandardKuberaySettingsVar.RAY_GPU_RESOURCE_REQUEST_KEY.value: json.dumps( "nvidia.com/gpu" ), + StandardKuberaySettingsVar.RAY_GPU_ANNOTATIONS.value: json.dumps( + gpu_annotations + ), + StandardKuberaySettingsVar.RAY_GPU_LABELS.value: json.dumps(gpu_labels), } ): manifest = StandardKuberayWrapper.create_cluster_manifest( # type: ignore @@ -421,6 +446,10 @@ def test_worker_node_gpus(): assert manifest["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][0][ "resources" ]["requests"] == {"cpu": "1000m", "nvidia.com/gpu": 2, "memory": "2048Mi"} + assert manifest["spec"]["workerGroupSpecs"][0]["template"]["metadata"] == { + "labels": gpu_labels, + "annotations": gpu_annotations, + } def test_custom_service_account(): From 
d24916aac00515c7390825d2683b30a77e7384a2 Mon Sep 17 00:00:00 2001 From: augray Date: Tue, 15 Oct 2024 16:39:41 -0400 Subject: [PATCH 2/3] Add can_cast for tuple (#1132) Apparently `tuple` was missing an implementation for `can_cast_type`. It likely went unnoticed because `can_cast` has a quick check to return `True` if the source and destination types are the same python object (`a is b` check), which would work for most cases. But for other cases you might want it to work, the implementation was missing & causing problems. This PR adds an implementation. Co-authored-by: Josh Bauer --- sematic/types/types/tests/test_tuple.py | 33 +++++++++++++++++- sematic/types/types/tuple.py | 45 ++++++++++++++++++++++++- 2 files changed, 76 insertions(+), 2 deletions(-) diff --git a/sematic/types/types/tests/test_tuple.py b/sematic/types/types/tests/test_tuple.py index 2de7301d3..6061082f0 100644 --- a/sematic/types/types/tests/test_tuple.py +++ b/sematic/types/types/tests/test_tuple.py @@ -6,7 +6,7 @@ import pytest # Sematic -from sematic.types.casting import safe_cast +from sematic.types.casting import can_cast_type, safe_cast from sematic.types.serialization import ( get_json_encodable_summary, type_from_json_encodable, @@ -44,6 +44,37 @@ def test_tuple(value, type_, expected_value, expected_error): assert cast_value == expected_value +@pytest.mark.parametrize( + "from_type, to_type, expected_error", + ( + (Tuple[float, int], Tuple[float, int], None), + (Tuple[float, int], Tuple[float, float], None), + (Tuple[int, str], Tuple[float, str], None), + (Tuple[int, str], int, "Cannot cast typing.Tuple[int, str] to int"), + ( + Tuple[int, str], + Tuple[int, str, int], + "Can't cast typing.Tuple[int, str] to typing.Tuple[int, str, int]: " + "they have different arities (2 vs 3)", + ), + ( + Tuple[int, str], + Tuple[int, int], + "Can't cast typing.Tuple[int, str] to typing.Tuple[int, int]:: " + "Cannot cast to int", + ), + ), +) +def test_can_cast_tuple(from_type, to_type, 
expected_error): + can_cast, error = can_cast_type(from_type, to_type) + if expected_error is None: + assert error is None + assert can_cast + else: + assert not can_cast + assert error == expected_error + + def test_summary(): summary, blobs = get_json_encodable_summary(("foo", 42), Tuple[str, float]) diff --git a/sematic/types/types/tuple.py b/sematic/types/types/tuple.py index 78d4494e9..960c88673 100644 --- a/sematic/types/types/tuple.py +++ b/sematic/types/types/tuple.py @@ -1,10 +1,12 @@ # Standard Library +import typing from typing import Any, Iterable, List, Optional, Tuple, Type # Sematic -from sematic.types.casting import safe_cast +from sematic.types.casting import can_cast_type, safe_cast from sematic.types.registry import ( SummaryOutput, + register_can_cast, register_from_json_encodable, register_safe_cast, register_to_json_encodable, @@ -88,3 +90,44 @@ def _tuple_to_json_encodable_summary(value: Tuple, type_: Type) -> SummaryOutput blobs.update(element_blobs) return summary, blobs + + +# Using `tuple` instead of `typing.Tuple` here because +# `typing.Tuple[T].__origin__` is `tuple` +@register_can_cast(tuple) +def can_cast_to_tuple(from_type: typing.Any, to_type: typing.Any): + """ + Type casting logic for `Tuple[T, U, ]`. + + `from_type` and `to_type` should be subscripted generics + of the form `Tuple[T, ]`. + + A type of the form `Tuple[T, ]` is castable to `Tuple[U, ]` if the + two tuples have the same arity and the types in the first tuple can all be cast + to the corresponding types in the second. + + For example `Tuple[int, float]` is castable to `Tuple[float, float]`, but + `Tuple[int, str]` is not. 
+ """ + err_prefix = "Can't cast {} to {}:".format(from_type, to_type) + + from_args = typing.get_args(from_type) + if len(from_args) < 1: + return False, "{} not a subscripted generic".format(err_prefix) + + to_args = typing.get_args(to_type) + if len(from_args) != len(to_args): + return False, "{} they have different arities ({} vs {})".format( + err_prefix, len(from_args), len(to_args) + ) + + from_origin = typing.get_origin(from_type) + if not (from_origin is not None and issubclass(from_origin, tuple)): + return False, "{} not a tuple".format(err_prefix) + + for from_t, to_t in zip(from_args, to_args): + can_cast, error = can_cast_type(from_t, to_t) + if can_cast is False: + return False, "{}: {}".format(err_prefix, error) + + return True, None From dc3d3e3dca78fd122acb1ca75d15345cec5a7503 Mon Sep 17 00:00:00 2001 From: augray Date: Wed, 16 Oct 2024 11:08:22 -0400 Subject: [PATCH 3/3] Release v0.40.0 (#1133) Release v0.40.0 Co-authored-by: Josh Bauer --- README.md | 2 +- README.rst | 4 ++-- docs/changelog.md | 3 +++ helm/sematic-server/Chart.yaml | 4 ++-- sematic/versions.py | 2 +- sematic/wheel_constants.bzl | 2 +- 6 files changed, 10 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index fbfb05317..dbc00f9ed 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@

Build ML pipelines with only Python, run on your laptop, or in the cloud.

-![PyPI](https://img.shields.io/pypi/v/sematic/0.39.1?style=for-the-badge) +![PyPI](https://img.shields.io/pypi/v/sematic/0.40.0?style=for-the-badge) [![CircleCI](https://img.shields.io/circleci/build/github/sematic-ai/sematic/main?label=CircleCI&style=for-the-badge&token=60d1953bfee5b6bf8201f8e84a10eaa5bf5622fe)](https://app.circleci.com/pipelines/github/sematic-ai/sematic?branch=main&filter=all) ![PyPI - License](https://img.shields.io/pypi/l/sematic?style=for-the-badge) [![Python 3.8](https://img.shields.io/badge/Python-3.8-blue?style=for-the-badge&logo=none)](https://python.org) diff --git a/README.rst b/README.rst index a964dfde3..f2760cfa4 100644 --- a/README.rst +++ b/README.rst @@ -6,8 +6,8 @@ -.. image:: https://img.shields.io/pypi/v/sematic/0.39.1?style=for-the-badge - :target: https://img.shields.io/pypi/v/sematic/0.39.1?style=for-the-badge +.. image:: https://img.shields.io/pypi/v/sematic/0.40.0?style=for-the-badge + :target: https://img.shields.io/pypi/v/sematic/0.40.0?style=for-the-badge :alt: PyPI diff --git a/docs/changelog.md b/docs/changelog.md index 65595da51..93ab84d80 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -5,6 +5,9 @@ Lines for version numbers should always be formatted as with nothing else on the line. 
--> * HEAD +* [0.40.0](https://pypi.org/project/sematic/0.40.0/) + * [feature] Allow custom labels and annotations for Ray integration + * [bugfix] Fix an issue with tuple type casting checks * [0.39.1](https://pypi.org/project/sematic/0.39.1/) * [bugfix] Fix DB migrations from clean installs after SQLAlchemy upgrade * [0.39.0](https://pypi.org/project/sematic/0.39.0/) diff --git a/helm/sematic-server/Chart.yaml b/helm/sematic-server/Chart.yaml index 50a68fa15..6000f8634 100644 --- a/helm/sematic-server/Chart.yaml +++ b/helm/sematic-server/Chart.yaml @@ -2,8 +2,8 @@ apiVersion: v2 name: sematic-server description: Sematic AI Server type: application -version: 1.1.23 -appVersion: v0.39.1 +version: 1.2.0 +appVersion: v0.40.0 maintainers: - name: sematic-ai url: https://github.com/sematic-ai/sematic/ diff --git a/sematic/versions.py b/sematic/versions.py index a67fcacdb..d6d97902b 100644 --- a/sematic/versions.py +++ b/sematic/versions.py @@ -9,7 +9,7 @@ # the sdk. Should be bumped any time a release is made. Should be set # to whatever is the version after the most recent one in changelog.md, # as well as the version for the sematic wheel in wheel_constants.bzl -CURRENT_VERSION = (0, 39, 1) +CURRENT_VERSION = (0, 40, 0) # TO DEPRECATE # 0.X.X: diff --git a/sematic/wheel_constants.bzl b/sematic/wheel_constants.bzl index e8f385415..c7e236440 100644 --- a/sematic/wheel_constants.bzl +++ b/sematic/wheel_constants.bzl @@ -2,7 +2,7 @@ # changelog.md. # This is the version that will be attached to the # wheel that bazel builds for sematic. -wheel_version_string = "0.39.1" +wheel_version_string = "0.40.0" wheel_author = "Sematic AI, Inc." wheel_author_email = "support@sematic.dev"