Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 21 additions & 11 deletions examples/clusters_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,23 +29,27 @@ def create_cluster_example():
# Get SSH keys
ssh_keys = [key.id for key in verda.ssh_keys.get()]

cluster_type = '16B200'
cluster_image = 'ubuntu-24.04-cuda-13.0-cluster'
location_code = Locations.FIN_03

# Check if cluster type is available
if not verda.clusters.is_available('16B200', Locations.FIN_03):
raise ValueError('Cluster type 16B200 is not available in FIN_03')
if not verda.clusters.is_available(cluster_type, location_code):
raise ValueError(f'Cluster type {cluster_type} is not available in {location_code}')

# Get available images for cluster type
images = verda.clusters.get_cluster_images('16B200')
if 'ubuntu-22.04-cuda-12.9-cluster' not in images:
raise ValueError('Ubuntu 22.04 CUDA 12.9 cluster image is not supported for 16B200')
images = verda.clusters.get_cluster_images(cluster_type)
if cluster_image not in images:
raise ValueError(f'Cluster image {cluster_image} is not supported for {cluster_type}')

# Create a 16B200 cluster
# Create a cluster
cluster = verda.clusters.create(
hostname='my-compute-cluster',
cluster_type='16B200',
image='ubuntu-22.04-cuda-12.9-cluster',
cluster_type=cluster_type,
image=cluster_image,
description='Example compute cluster for distributed training',
ssh_key_ids=ssh_keys,
location=Locations.FIN_03,
location=location_code,
shared_volume_name='my-shared-volume',
shared_volume_size=30000,
wait_for_status=None,
Expand All @@ -59,8 +63,8 @@ def create_cluster_example():

# Wait for cluster to enter RUNNING status
while cluster.status != ClusterStatus.RUNNING:
time.sleep(30)
print(f'Waiting for cluster to enter RUNNING status... (status: {cluster.status})')
time.sleep(3)
cluster = verda.clusters.get_by_id(cluster.id)

print(f'Public IP: {cluster.ip}')
Expand Down Expand Up @@ -100,7 +104,13 @@ def get_cluster_by_id_example(cluster_id: str):
print(f' Created at: {cluster.created_at}')
print(f' Public IP: {cluster.ip}')
print(f' Worker nodes: {len(cluster.worker_nodes)}')

for node in cluster.worker_nodes:
print(f' - {node.hostname} ({node.id}): {node.status}, private IP: {node.private_ip}')
print(f' Shared volumes: {len(cluster.shared_volumes)}')
for volume in cluster.shared_volumes:
print(
f' - {volume.name} ({volume.id}): {volume.size_in_gigabytes} GB, mounted at {volume.mount_point}'
)
return cluster


Expand Down
23 changes: 22 additions & 1 deletion verda/clusters/_clusters.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

from verda.constants import Actions, ClusterStatus, ErrorCodes, Locations
from verda.exceptions import APIException
from verda.http_client import HTTPClient

CLUSTERS_ENDPOINT = '/clusters'

Expand All @@ -31,6 +32,24 @@ class ClusterWorkerNode:
private_ip: str


@dataclass_json
@dataclass
class SharedVolume:
    """A shared volume that belongs to a cluster.

    Attributes:
        id: Identifier that uniquely identifies the volume.
        name: The volume's name.
        size_in_gigabytes: Volume capacity, expressed in gigabytes.
        mount_point: Where the volume is mounted; ``None`` when no mount
            point is supplied.
    """

    # Required fields: these have no defaults, so they must always be given.
    id: str
    name: str
    size_in_gigabytes: int

    # Optional field: falls back to None when omitted.
    mount_point: str | None = None


@dataclass_json
@dataclass
class Cluster:
Expand Down Expand Up @@ -59,7 +78,9 @@ class Cluster:
location: str
cluster_type: str
worker_nodes: list[ClusterWorkerNode]
shared_volumes: list[SharedVolume]
ssh_key_ids: list[str]

image: str | None = None
startup_script_id: str | None = None
ip: str | None = None
Expand All @@ -71,7 +92,7 @@ class ClustersService:
This service provides methods to create, retrieve, and manage compute clusters.
"""

def __init__(self, http_client) -> None:
def __init__(self, http_client: HTTPClient) -> None:
"""Initializes the ClustersService with an HTTP client.

Args:
Expand Down