Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
123 changes: 123 additions & 0 deletions libs/python/computer-server/computer_server/diorama/diorama.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,20 +20,39 @@
automation_handler = MacOSAutomationHandler()

class Diorama:
"""Virtual desktop manager that provides automation capabilities for macOS applications.

Manages application windows and provides an interface for taking screenshots,
mouse interactions, keyboard input, and coordinate transformations between
screenshot space and screen space.
"""
# Class-level scheduler state, shared by every Diorama instance.
# Queue of pending commands; assigned an asyncio.Queue in _ensure_scheduler
# and consumed by _scheduler_loop.
_scheduler_queue = None
# NOTE(review): presumably the task running _scheduler_loop; the assignment
# is not visible in this view, confirm.
_scheduler_task = None
# NOTE(review): presumably the event loop the scheduler runs on; confirm.
_loop = None
# Checked by _ensure_scheduler before it starts the scheduler.
_scheduler_started = False

@classmethod
def create_from_apps(cls, *args) -> DioramaComputer:
    """Create a DioramaComputer for a set of application names.

    The names may be given as separate positional arguments
    (``create_from_apps("Discord", "Notes")``) or as one list/tuple
    (``create_from_apps(["Discord", "Notes"])``). Both forms store the same
    flat tuple of names on the new instance.

    Args:
        *args: Application names, or a single list/tuple of application names.

    Returns:
        DioramaComputer: The ``computer`` attribute of the newly built instance.
    """
    # A lone list/tuple argument would otherwise be stored as ``(["A", "B"],)``.
    # The cursor bookkeeping hashes ``tuple(sorted(app_list))``, which raises
    # TypeError on that nested list, so unwrap it into a flat tuple here.
    if len(args) == 1 and isinstance(args[0], (list, tuple)):
        args = tuple(args[0])

    # The scheduler is started (if needed) before the instance is built.
    cls._ensure_scheduler()
    return cls(args).computer

# Cursor positions shared across all Diorama instances.
# Keyed by hash(tuple(sorted(app_list))); readers fall back to (0, 0) when
# nothing has been recorded for that key.
_cursor_positions = {}

def __init__(self, app_list):
"""Initialize a Diorama instance for the specified applications.

Args:
app_list: List of application names to manage
"""
self.app_list = app_list
self.interface = self.Interface(self)
self.computer = DioramaComputer(self)
Expand All @@ -48,6 +67,10 @@ def __init__(self, app_list):

@classmethod
def _ensure_scheduler(cls):
"""Ensure the async scheduler loop is running.

Creates and starts the scheduler task if it hasn't been started yet.
"""
if not cls._scheduler_started:
logger.info("Starting Diorama scheduler loop…")
cls._scheduler_queue = asyncio.Queue()
Expand All @@ -57,6 +80,11 @@ def _ensure_scheduler(cls):

@classmethod
async def _scheduler_loop(cls):
"""Main scheduler loop that processes automation commands.

Continuously processes commands from the scheduler queue, handling
screenshots, mouse actions, keyboard input, and scrolling operations.
"""
while True:
cmd = await cls._scheduler_queue.get()
action = cmd.get("action")
Expand Down Expand Up @@ -144,13 +172,33 @@ async def _scheduler_loop(cls):
future.set_exception(e)

class Interface():
"""Interface for interacting with the virtual desktop.

Provides methods for taking screenshots, mouse interactions, keyboard input,
and coordinate transformations between screenshot and screen coordinates.
"""

def __init__(self, diorama):
    """Bind this interface to its owning Diorama and reset the scene cache.

    Args:
        diorama: The Diorama instance this interface acts on behalf of.
    """
    # Scene geometry is unknown until screenshot() fills it in.
    self._scene_size = None
    self._scene_hitboxes = []

    self._diorama = diorama

async def _send_cmd(self, action, arguments=None):
"""Send a command to the scheduler queue.

Args:
action (str): The action to perform
arguments (dict, optional): Arguments for the action

Returns:
The result of the command execution
"""
Diorama._ensure_scheduler()
loop = asyncio.get_event_loop()
future = loop.create_future()
Expand All @@ -167,6 +215,14 @@ async def _send_cmd(self, action, arguments=None):
return None

async def screenshot(self, as_bytes: bool = True) -> Union[str, Image.Image]:
"""Take a screenshot of the managed applications.

Args:
as_bytes (bool): If True, return base64-encoded bytes; if False, return PIL Image

Returns:
Union[str, Image.Image]: Base64-encoded PNG bytes or PIL Image object
"""
import base64
result, img = await self._send_cmd("screenshot")
self._scene_hitboxes = result.get("hitboxes", [])
Expand All @@ -184,6 +240,12 @@ async def screenshot(self, as_bytes: bool = True) -> Union[str, Image.Image]:
return img

async def left_click(self, x, y):
"""Perform a left mouse click at the specified coordinates.

Args:
x (int): X coordinate in screenshot space (or None to use last position)
y (int): Y coordinate in screenshot space (or None to use last position)
"""
# Get last cursor position for this app_list hash
app_list_hash = hash(tuple(sorted(self._diorama.app_list)))
last_pos = Diorama._cursor_positions.get(app_list_hash, (0, 0))
Expand All @@ -195,6 +257,12 @@ async def left_click(self, x, y):
await self._send_cmd("left_click", {"x": sx, "y": sy})

async def right_click(self, x, y):
"""Perform a right mouse click at the specified coordinates.

Args:
x (int): X coordinate in screenshot space (or None to use last position)
y (int): Y coordinate in screenshot space (or None to use last position)
"""
# Get last cursor position for this app_list hash
app_list_hash = hash(tuple(sorted(self._diorama.app_list)))
last_pos = Diorama._cursor_positions.get(app_list_hash, (0, 0))
Expand All @@ -206,6 +274,12 @@ async def right_click(self, x, y):
await self._send_cmd("right_click", {"x": sx, "y": sy})

async def double_click(self, x, y):
"""Perform a double mouse click at the specified coordinates.

Args:
x (int): X coordinate in screenshot space (or None to use last position)
y (int): Y coordinate in screenshot space (or None to use last position)
"""
# Get last cursor position for this app_list hash
app_list_hash = hash(tuple(sorted(self._diorama.app_list)))
last_pos = Diorama._cursor_positions.get(app_list_hash, (0, 0))
Expand All @@ -217,6 +291,12 @@ async def double_click(self, x, y):
await self._send_cmd("double_click", {"x": sx, "y": sy})

async def move_cursor(self, x, y):
"""Move the mouse cursor to the specified coordinates.

Args:
x (int): X coordinate in screenshot space (or None to use last position)
y (int): Y coordinate in screenshot space (or None to use last position)
"""
# Get last cursor position for this app_list hash
app_list_hash = hash(tuple(sorted(self._diorama.app_list)))
last_pos = Diorama._cursor_positions.get(app_list_hash, (0, 0))
Expand All @@ -228,6 +308,13 @@ async def move_cursor(self, x, y):
await self._send_cmd("move_cursor", {"x": sx, "y": sy})

async def drag_to(self, x, y, duration=0.5):
"""Drag the mouse from current position to the specified coordinates.

Args:
x (int): X coordinate in screenshot space (or None to use last position)
y (int): Y coordinate in screenshot space (or None to use last position)
duration (float): Duration of the drag operation in seconds
"""
# Get last cursor position for this app_list hash
app_list_hash = hash(tuple(sorted(self._diorama.app_list)))
last_pos = Diorama._cursor_positions.get(app_list_hash, (0, 0))
Expand All @@ -239,18 +326,43 @@ async def drag_to(self, x, y, duration=0.5):
await self._send_cmd("drag_to", {"x": sx, "y": sy, "duration": duration})

async def get_cursor_position(self):
    """Ask the scheduler for the cursor position.

    Returns:
        Whatever the "get_cursor_position" command yields, unchanged.
        NOTE(review): presumably an (x, y) pair in screen space; confirm
        against the scheduler's handler.
    """
    position = await self._send_cmd("get_cursor_position")
    return position

async def type_text(self, text):
    """Queue a "type_text" command carrying the given text.

    Args:
        text (str): The text to send with the command.
    """
    payload = {"text": text}
    await self._send_cmd("type_text", payload)

async def press_key(self, key):
    """Queue a "press_key" command for a single key.

    Args:
        key (str): The key to send with the command.
    """
    payload = {"key": key}
    await self._send_cmd("press_key", payload)

async def hotkey(self, keys):
    """Queue a "hotkey" command for a combination of keys.

    Args:
        keys: Iterable of keys; it is copied into a new list before sending.
    """
    key_list = [*keys]
    await self._send_cmd("hotkey", {"keys": key_list})

async def scroll_up(self, clicks: int = 1):
"""Scroll up at the current cursor position.

Args:
clicks (int): Number of scroll clicks to perform
"""
# Get last cursor position for this app_list hash
app_list_hash = hash(tuple(sorted(self._diorama.app_list)))
last_pos = Diorama._cursor_positions.get(app_list_hash, (0, 0))
Expand All @@ -259,6 +371,11 @@ async def scroll_up(self, clicks: int = 1):
await self._send_cmd("scroll_up", {"clicks": clicks, "x": x, "y": y})

async def scroll_down(self, clicks: int = 1):
"""Scroll down at the current cursor position.

Args:
clicks (int): Number of scroll clicks to perform
"""
# Get last cursor position for this app_list hash
app_list_hash = hash(tuple(sorted(self._diorama.app_list)))
last_pos = Diorama._cursor_positions.get(app_list_hash, (0, 0))
Expand All @@ -267,6 +384,11 @@ async def scroll_down(self, clicks: int = 1):
await self._send_cmd("scroll_down", {"clicks": clicks, "x": x, "y": y})

async def get_screen_size(self) -> dict[str, int]:
    """Report the cached scene size, taking a screenshot first if none is cached.

    Returns:
        dict[str, int]: Mapping with 'width' and 'height' keys, taken from
        the first and second entries of the cached scene size.
    """
    size = self._scene_size
    if not size:
        # Nothing cached yet: screenshot() is what fills in _scene_size.
        await self.screenshot()
        size = self._scene_size

    width = size[0]
    height = size[1]
    return {"width": width, "height": height}
Expand Down Expand Up @@ -348,6 +470,7 @@ async def to_screenshot_coordinates(self, x: float, y: float) -> tuple[float, fl
import time

async def main():
"""Main function demonstrating Diorama usage with multiple desktops and mouse tracking."""
desktop1 = Diorama.create_from_apps(["Discord", "Notes"])
desktop2 = Diorama.create_from_apps(["Terminal"])

Expand Down
Loading
Loading