From 6c026cece471b8bfb7d2f77f01954ce068ee88bd Mon Sep 17 00:00:00 2001 From: hassineabd Date: Mon, 29 Dec 2025 14:59:51 +0100 Subject: [PATCH 01/19] filtering strategy android - clean architecture --- Agent/platforms/filters/__init__.py | 7 ++++ Agent/platforms/filters/android/__init__.py | 29 ++++++++++++++ Agent/platforms/filters/android/bounds.py | 34 +++++++++++++++++ Agent/platforms/filters/android/displayed.py | 9 +++++ .../platforms/filters/android/interactive.py | 38 +++++++++++++++++++ Agent/platforms/filters/pipeline.py | 18 +++++++++ 6 files changed, 135 insertions(+) create mode 100644 Agent/platforms/filters/__init__.py create mode 100644 Agent/platforms/filters/android/__init__.py create mode 100644 Agent/platforms/filters/android/bounds.py create mode 100644 Agent/platforms/filters/android/displayed.py create mode 100644 Agent/platforms/filters/android/interactive.py create mode 100644 Agent/platforms/filters/pipeline.py diff --git a/Agent/platforms/filters/__init__.py b/Agent/platforms/filters/__init__.py new file mode 100644 index 0000000..b885f85 --- /dev/null +++ b/Agent/platforms/filters/__init__.py @@ -0,0 +1,7 @@ +from Agent.platforms.filters.pipeline import FilterPipeline +from Agent.platforms.filters.android import AndroidFilterPipeline + +__all__ = [ + 'FilterPipeline', + 'AndroidFilterPipeline', +] diff --git a/Agent/platforms/filters/android/__init__.py b/Agent/platforms/filters/android/__init__.py new file mode 100644 index 0000000..1928e9d --- /dev/null +++ b/Agent/platforms/filters/android/__init__.py @@ -0,0 +1,29 @@ +from typing import Any, Dict, List +from Agent.platforms.filters.android.displayed import DisplayedFilter +from Agent.platforms.filters.android.bounds import BoundsFilter +from Agent.platforms.filters.android.interactive import InteractiveFilter +from Agent.platforms.filters.pipeline import FilterPipeline + + +class AndroidFilterPipeline: + """Pre-configured filter pipeline for Android elements.""" + + def __init__(self, screen_size: Dict[str, int] = None): + screen_size = screen_size or {} + self._pipeline = FilterPipeline([ + DisplayedFilter(), + BoundsFilter(screen_size.get('width', 0), screen_size.get('height', 0)), + InteractiveFilter(), + ]) + + def apply(self, elements: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + return self._pipeline.apply(elements) + + +__all__ = [ + 'DisplayedFilter', + 'BoundsFilter', + 'InteractiveFilter', + 'AndroidFilterPipeline', +] + diff --git a/Agent/platforms/filters/android/bounds.py b/Agent/platforms/filters/android/bounds.py new file mode 100644 index 0000000..c1cc6ee --- /dev/null +++ b/Agent/platforms/filters/android/bounds.py @@ -0,0 +1,34 @@ +from typing import Any, Dict, List + + +class BoundsFilter: + """Keep Android elements with valid bounds that intersect the screen.""" + + def __init__(self, screen_width: int = 0, screen_height: int = 0): + self._screen_width = screen_width + self._screen_height = screen_height + + def apply(self, elements: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + result = [] + for e in elements: + bbox = e.get('bbox', {}) + if not bbox: + continue + + x = bbox.get('x', 0) + y = bbox.get('y', 0) + w = bbox.get('width', 0) + h = bbox.get('height', 0) + + if w <= 0 or h <= 0: + continue + + if self._screen_width > 0 and self._screen_height > 0: + if x + w < 0 or y + h < 0: + continue + if x > self._screen_width or y > self._screen_height: + continue + + result.append(e) + return result + diff --git a/Agent/platforms/filters/android/displayed.py b/Agent/platforms/filters/android/displayed.py new file mode 100644 index 0000000..5ab4c0f --- /dev/null +++ b/Agent/platforms/filters/android/displayed.py @@ -0,0 +1,9 @@ +from typing import Any, Dict, List + + +class DisplayedFilter: + """Keep only displayed Android elements.""" + + def apply(self, elements: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + return [e for e in elements if e.get('displayed') == 'true'] + diff --git a/Agent/platforms/filters/android/interactive.py b/Agent/platforms/filters/android/interactive.py new file mode 100644 index 0000000..291269e --- /dev/null +++ b/Agent/platforms/filters/android/interactive.py @@ -0,0 +1,38 @@ +from typing import Any, Dict, List, Set + + +class InteractiveFilter: + """Keep Android elements that are likely interactive.""" + + INTERACTIVE_CLASSES: Set[str] = { + 'Button', 'ImageButton', 'EditText', 'TextView', 'CheckBox', + 'RadioButton', 'Switch', 'ToggleButton', 'Spinner', 'SeekBar', + 'ImageView', 'FloatingActionButton', 'Chip', 'Tab', + } + + def apply(self, elements: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + return [e for e in elements if self._is_interactive(e)] + + def _is_interactive(self, e: Dict[str, Any]) -> bool: + if e.get('clickable') == 'true': + return True + if e.get('focusable') == 'true': + return True + if e.get('scrollable') == 'true': + return True + + text = e.get('text', '') + if text and str(text).strip(): + return True + + content_desc = e.get('content-desc', '') + if content_desc and str(content_desc).strip(): + return True + + class_name = e.get('class', '') + for interactive_class in self.INTERACTIVE_CLASSES: + if interactive_class in class_name: + return True + + return False + diff --git a/Agent/platforms/filters/pipeline.py b/Agent/platforms/filters/pipeline.py new file mode 100644 index 0000000..75daef9 --- /dev/null +++ b/Agent/platforms/filters/pipeline.py @@ -0,0 +1,18 @@ +from typing import Any, Dict, List + + +class FilterPipeline: + """Composable pipeline of filters.""" + + def __init__(self, filters: List = None): + self._filters = filters or [] + + def add(self, f) -> 'FilterPipeline': + self._filters.append(f) + return self + + def apply(self, elements: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + for f in self._filters: + elements = f.apply(elements) + return elements + From 312828ed90baf348e1aee0c462636da4b74f6302 Mon Sep 17 00:00:00 2001 From: hassineabd Date: Mon, 29 Dec 2025 15:01:47 +0100 Subject: [PATCH 02/19] collecting ui element separated - android --- Agent/platforms/collectors/__init__.py | 14 +- .../platforms/collectors/android_collector.py | 57 ++++++++ Agent/platforms/collectors/base_collector.py | 57 -------- .../platforms/collectors/collector_factory.py | 48 ------- Agent/platforms/collectors/ios_collector.py | 11 ++ Agent/platforms/collectors/xml_collector.py | 125 ------------------ 6 files changed, 74 insertions(+), 238 deletions(-) create mode 100644 Agent/platforms/collectors/android_collector.py delete mode 100644 Agent/platforms/collectors/base_collector.py delete mode 100644 Agent/platforms/collectors/collector_factory.py create mode 100644 Agent/platforms/collectors/ios_collector.py delete mode 100644 Agent/platforms/collectors/xml_collector.py diff --git a/Agent/platforms/collectors/__init__.py b/Agent/platforms/collectors/__init__.py index 9d0a93e..0d6b36b 100644 --- a/Agent/platforms/collectors/__init__.py +++ b/Agent/platforms/collectors/__init__.py @@ -1,19 +1,17 @@ """ UI Collectors for mobile automation. -This module provides strategies for collecting UI elements: -- XMLCollector: XML page source parsing (Android/iOS) +- AndroidCollector: Android XML page source parsing +- IOSCollector: iOS XML page source parsing (NotImplemented) """ -from Agent.platforms.collectors.base_collector import BaseUICollector -from Agent.platforms.collectors.collector_factory import CollectorRegistry -from Agent.platforms.collectors.xml_collector import XMLCollector +from Agent.platforms.collectors.android_collector import AndroidCollector +from Agent.platforms.collectors.ios_collector import IOSCollector from Agent.platforms.collectors.som_renderer import render_som, bbox_center __all__ = [ - 'BaseUICollector', - 'CollectorRegistry', - 'XMLCollector', + 'AndroidCollector', + 'IOSCollector', 'render_som', 'bbox_center', ] diff --git a/Agent/platforms/collectors/android_collector.py b/Agent/platforms/collectors/android_collector.py new file mode 100644 index 0000000..6c8dc56 --- /dev/null +++ b/Agent/platforms/collectors/android_collector.py @@ -0,0 +1,57 @@ +from typing import Any, Dict, List +import xml.etree.ElementTree as ET + + +class AndroidCollector: + """Collects UI elements from Android XML page source.""" + + def get_name(self) -> str: + return "android" + + def parse_xml(self, xml_source: str) -> List[Dict[str, Any]]: + """ + Args: + xml_source: Appium page source XML + Returns: + List of element dicts with raw XML attributes + computed bbox + """ + root = ET.fromstring(xml_source) + elements = [] + + def walk(node: Any) -> None: + attrs = self._parse_node(node) + elements.append(attrs) + for child in node: + walk(child) + + walk(root) + return elements + + def _parse_node(self, node: Any) -> Dict[str, Any]: + raw_attrs = dict(node.attrib) + + bounds_str = raw_attrs.get('bounds', '') + bbox = self._parse_bounds(bounds_str) + + return { + **raw_attrs, + 'bbox': bbox, + } + + def _parse_bounds(self, bounds_str: str) -> Dict[str, int]: + """ + Args: + bounds_str: "[0,72][1080,200]" + Returns: + {'x': 0, 'y': 72, 'width': 1080, 'height': 128} + """ + if not bounds_str: + return {} + try: + parts = bounds_str.replace('][', ',').strip('[]').split(',') + if len(parts) == 4: + x1, y1, x2, y2 = map(int, parts) + return {'x': x1, 'y': y1, 'width': x2 - x1, 'height': y2 - y1} + except (ValueError, AttributeError): + pass + return {} diff --git a/Agent/platforms/collectors/base_collector.py b/Agent/platforms/collectors/base_collector.py deleted file mode 100644 index c9603d6..0000000 --- a/Agent/platforms/collectors/base_collector.py +++ /dev/null @@ -1,57 +0,0 @@ -""" -Base abstract class for UI element collectors. - -All collector strategies must inherit from BaseUICollector and implement -the required methods. -""" - -from abc import ABC, abstractmethod -from typing import Any, Dict, List - - -class BaseUICollector(ABC): - """ - Abstract base class for UI element collection strategies. - - Each collector strategy must implement: - 1. collect_elements() - to gather UI elements from the page - 2. get_name() - to identify the strategy - """ - - @abstractmethod - def collect_elements(self, max_items: int = 500) -> List[Dict[str, Any]]: - """ - Collect interactive UI elements from the current page. - - Args: - max_items: Maximum number of elements to return - - Returns: - List of dictionaries with element attributes: - { - 'text': str, # Visible text - 'resource_id': str, # ID or test-id - 'content_desc': str, # aria-label or placeholder - 'label': str, # Associated label text - 'class_name': str, # Tag name (button, input, etc.) - 'role': str, # ARIA role - 'name': str, # name attribute - 'type': str, # input type - 'href': str, # href for links - 'clickable': bool, # Is element clickable - 'enabled': bool, # Is element enabled - 'bbox': dict # Bounding box {'x': int, 'y': int, 'width': int, 'height': int} - } - """ - pass - - @abstractmethod - def get_name(self) -> str: - """ - Return the name of this collector strategy. - - Returns: - String identifier (e.g., "js_query") - """ - pass - diff --git a/Agent/platforms/collectors/collector_factory.py b/Agent/platforms/collectors/collector_factory.py deleted file mode 100644 index 1259391..0000000 --- a/Agent/platforms/collectors/collector_factory.py +++ /dev/null @@ -1,48 +0,0 @@ -""" -Factory and Registry for UI Collectors. -""" - -from typing import Dict, List, Type -from robot.api import logger -from Agent.platforms.collectors.base_collector import BaseUICollector - - -class CollectorRegistry: - """Registry for UI collector strategies.""" - - _collectors: Dict[str, Type[BaseUICollector]] = {} - - @classmethod - def register(cls, name: str, collector_class: Type[BaseUICollector]) -> None: - if not issubclass(collector_class, BaseUICollector): - raise TypeError(f"{collector_class} must inherit from BaseUICollector") - cls._collectors[name] = collector_class - logger.debug(f"Registered UI collector: '{name}' -> {collector_class.__name__}") - - @classmethod - def create(cls, strategy: str) -> BaseUICollector: - if strategy not in cls._collectors: - available = cls.list_available() - raise ValueError(f"Unknown strategy: '{strategy}'. Available: {available}") - - collector_class = cls._collectors[strategy] - return collector_class() - - @classmethod - def list_available(cls) -> List[str]: - return list(cls._collectors.keys()) - - @classmethod - def is_registered(cls, strategy: str) -> bool: - return strategy in cls._collectors - - -def _register_builtin_collectors(): - try: - from Agent.platforms.collectors.xml_collector import XMLCollector - CollectorRegistry.register("xml", XMLCollector) - except ImportError as e: - logger.warn(f"Could not register XMLCollector: {e}") - - -_register_builtin_collectors() diff --git a/Agent/platforms/collectors/ios_collector.py b/Agent/platforms/collectors/ios_collector.py new file mode 100644 index 0000000..6dd7fdc --- /dev/null +++ b/Agent/platforms/collectors/ios_collector.py @@ -0,0 +1,11 @@ +from typing import Any, Dict, List + + +class IOSCollector: + """Collects UI elements from iOS XML page source.""" + + def get_name(self) -> str: + return "ios" + + def parse_xml(self, xml_source: str) -> List[Dict[str, Any]]: + raise NotImplementedError("iOS collector not implemented yet") diff --git a/Agent/platforms/collectors/xml_collector.py b/Agent/platforms/collectors/xml_collector.py deleted file mode 100644 index d4e0ec0..0000000 --- a/Agent/platforms/collectors/xml_collector.py +++ /dev/null @@ -1,125 +0,0 @@ - - -from typing import Any, Dict, List -import xml.etree.ElementTree as ET -from robot.api import logger -from Agent.platforms.collectors.base_collector import BaseUICollector - - -class XMLCollector(BaseUICollector): - """ - Collects UI elements by parsing Appium XML page source. - - Supports both Android and iOS XML formats. - """ - - def __init__(self, platform: str = "android"): - self._platform = platform - - def get_name(self) -> str: - return "xml" - - def set_platform(self, platform: str) -> None: - self._platform = platform - - def collect_elements(self, max_items: int = 50) -> List[Dict[str, Any]]: - raise NotImplementedError("Use parse_xml() with XML source instead") - - def parse_xml(self, xml_source: str, max_items: int = 50) -> List[Dict[str, Any]]: - """ - Parse XML page source to extract interactive elements. - - Args: - xml_source: Appium page source XML string - max_items: Maximum elements to return - Returns: - List of element dictionaries - """ - root = ET.fromstring(xml_source) - candidates = [] - - def walk(node: Any) -> None: - if self._platform == 'ios': - attrs = self._parse_ios_node(node) - else: - attrs = self._parse_android_node(node) - - if attrs['clickable'] and attrs['enabled']: - candidates.append(attrs) - - for child in node: - walk(child) - - walk(root) - - candidates.sort( - key=lambda x: ( - bool(x.get('text')), - bool(x.get('accessibility_label')), - bool(x.get('resource_id')) - ), - reverse=True - ) - - logger.debug(f"[{self.get_name()}] Platform: {self._platform}, Found {len(candidates)} interactive elements") - return candidates[:max_items] - - def _parse_android_node(self, node: Any) -> Dict[str, Any]: - """Parse Android XML node to element dict.""" - bbox = self._parse_android_bounds(node.get('bounds', '')) - content_desc = node.get('content-desc', '') - return { - 'text': node.get('text', ''), - 'resource_id': node.get('resource-id', ''), - 'class_name': node.get('class', ''), - 'accessibility_label': content_desc, - 'content_desc': content_desc, # backward compat - 'clickable': node.get('clickable', 'false') == 'true', - 'enabled': node.get('enabled', 'false') == 'true', - 'bbox': bbox, - } - - def _parse_ios_node(self, node: Any) -> Dict[str, Any]: - """Parse iOS XML node to element dict.""" - try: - bbox = { - 'x': int(node.get('x', 0)), - 'y': int(node.get('y', 0)), - 'width': int(node.get('width', 0)), - 'height': int(node.get('height', 0)), - } - except (ValueError, TypeError): - bbox = {} - - if bbox.get('width', 0) <= 0: - bbox = {} - - label = node.get('label', '') - return { - 'text': node.get('value', '') or label, - 'resource_id': node.get('name', ''), - 'class_name': node.get('type', ''), - 'accessibility_label': label, - 'label': label, # iOS-specific - 'clickable': node.get('enabled', 'false') == 'true', - 'enabled': node.get('enabled', 'false') == 'true', - 'bbox': bbox, - } - - def _parse_android_bounds(self, bounds_str: str) -> Dict[str, int]: - """ - Parse Android bounds string to bbox dict. - - Format: "[0,72][1080,200]" -> {x, y, width, height} - """ - if not bounds_str: - return {} - try: - parts = bounds_str.replace('][', ',').strip('[]').split(',') - if len(parts) == 4: - x1, y1, x2, y2 = map(int, parts) - return {'x': x1, 'y': y1, 'width': x2 - x1, 'height': y2 - y1} - except (ValueError, AttributeError): - pass - return {} - From 9990cf0b9a9219d4d7b0d4210f4bcba4ccb82ba1 Mon Sep 17 00:00:00 2001 From: hassineabd Date: Mon, 29 Dec 2025 15:02:36 +0100 Subject: [PATCH 03/19] building locator - android --- Agent/platforms/locators/__init__.py | 8 +- Agent/platforms/locators/android_locator.py | 102 ++++++++++++++++++++ Agent/platforms/locators/ios_locator.py | 15 +++ Agent/platforms/locators/mobile.py | 83 +++------------- 4 files changed, 140 insertions(+), 68 deletions(-) create mode 100644 Agent/platforms/locators/android_locator.py create mode 100644 Agent/platforms/locators/ios_locator.py diff --git a/Agent/platforms/locators/__init__.py b/Agent/platforms/locators/__init__.py index 2bf4991..5b8e665 100644 --- a/Agent/platforms/locators/__init__.py +++ b/Agent/platforms/locators/__init__.py @@ -1,3 +1,9 @@ from Agent.platforms.locators.mobile import MobileLocatorBuilder +from Agent.platforms.locators.android_locator import AndroidLocatorBuilder +from Agent.platforms.locators.ios_locator import IOSLocatorBuilder -__all__ = ["MobileLocatorBuilder"] +__all__ = [ + "MobileLocatorBuilder", + "AndroidLocatorBuilder", + "IOSLocatorBuilder", +] diff --git a/Agent/platforms/locators/android_locator.py b/Agent/platforms/locators/android_locator.py new file mode 100644 index 0000000..c034bc6 --- /dev/null +++ b/Agent/platforms/locators/android_locator.py @@ -0,0 +1,102 @@ +from typing import Any, Dict + + +class AndroidLocatorBuilder: + """Builds Appium locators for Android elements.""" + + def build(self, element: Dict[str, Any], robust: bool = False) -> str: + if robust: + return self.build_robust(element) + return self.build_priority(element) + + def build_priority(self, element: Dict[str, Any]) -> str: + """ + Args: + element: Dict with raw XML attributes + Returns: + First available locator: id > accessibility_id > text > class + """ + resource_id = self._get_str(element, 'resource-id') + if resource_id: + return f"id={resource_id}" + + content_desc = self._get_str(element, 'content-desc') + if content_desc: + return f"accessibility_id={content_desc}" + + text = self._get_str(element, 'text') + if text: + return f"//*[@text={self._escape_xpath(text)}]" + + class_name = self._get_str(element, 'class') + if class_name: + return f"class={class_name}" + + raise AssertionError("Cannot build locator: no usable attributes") + + def build_robust(self, element: Dict[str, Any]) -> str: + """ + Args: + element: Dict with raw XML attributes + Returns: + XPath combining all available attributes for uniqueness + """ + conditions = [] + + resource_id = self._get_str(element, 'resource-id') + if resource_id: + conditions.append(f"@resource-id={self._escape_xpath(resource_id)}") + + content_desc = self._get_str(element, 'content-desc') + if content_desc: + conditions.append(f"@content-desc={self._escape_xpath(content_desc)}") + + text = self._get_str(element, 'text') + if text: + conditions.append(f"@text={self._escape_xpath(text)}") + + bounds = self._get_str(element, 'bounds') + if bounds: + conditions.append(f"@bounds='{bounds}'") + + class_name = self._get_str(element, 'class') + base = f"//{class_name}" if class_name else "//*" + + if not conditions: + if class_name: + return base + raise AssertionError("Cannot build locator: no usable attributes") + + return f"{base}[{' and '.join(conditions)}]" + + def _get_str(self, element: Dict[str, Any], key: str) -> str: + val = element.get(key, '') + return str(val).strip() if val else '' + + def _escape_xpath(self, value: str) -> str: + """ + Args: + value: "It's a test" + Returns: + concat('It', \"'\", 's a test') or 'simple' + """ + if "'" not in value: + return f"'{value}'" + if '"' not in value: + return f'"{value}"' + + parts = [] + current = "" + for char in value: + if char == "'": + if current: + parts.append(f"'{current}'") + current = "" + parts.append("\"'\"") + else: + current += char + if current: + parts.append(f"'{current}'") + + return f"concat({', '.join(parts)})" + diff --git a/Agent/platforms/locators/ios_locator.py b/Agent/platforms/locators/ios_locator.py new file mode 100644 index 0000000..9c97f37 --- /dev/null +++ b/Agent/platforms/locators/ios_locator.py @@ -0,0 +1,15 @@ +from typing import Any, Dict + + +class IOSLocatorBuilder: + """Builds Appium locators for iOS elements.""" + + def build(self, element: Dict[str, Any], robust: bool = False) -> str: + raise NotImplementedError("iOS locator builder not implemented yet") + + def build_priority(self, element: Dict[str, Any]) -> str: + raise NotImplementedError("iOS locator builder not implemented yet") + + def build_robust(self, element: Dict[str, Any]) -> str: + raise NotImplementedError("iOS locator builder not implemented yet") + diff --git a/Agent/platforms/locators/mobile.py b/Agent/platforms/locators/mobile.py index a6fc8d3..c391849 100644 --- a/Agent/platforms/locators/mobile.py +++ b/Agent/platforms/locators/mobile.py @@ -1,81 +1,30 @@ from typing import Any, Dict +from Agent.platforms.locators.android_locator import AndroidLocatorBuilder +from Agent.platforms.locators.ios_locator import IOSLocatorBuilder class MobileLocatorBuilder: - """Builds Appium locators for Android and iOS.""" + """Facade that dispatches to platform-specific locator builders.""" def __init__(self, platform: str = "android"): self._platform = platform + self._android_builder = AndroidLocatorBuilder() + self._ios_builder = IOSLocatorBuilder() def set_platform(self, platform: str) -> None: self._platform = platform - def build(self, element: Dict[str, Any]) -> str: - """Dispatch to platform-specific method.""" + def build(self, element: Dict[str, Any], robust: bool = False) -> str: if self._platform == "ios": - return self.build_ios(element) - return self.build_android(element) + return self._ios_builder.build(element, robust=robust) + return self._android_builder.build(element, robust=robust) - def build_android(self, element: Dict[str, Any]) -> str: - """ - Build XPath locator combining all available attributes. - - Returns: "//*[@resource-id='x' and @content-desc='y' and @text='z']" - """ - resource_id = element.get('resource_id', '').strip() - acc_label = element.get('accessibility_label', '') or element.get('content_desc', '') - acc_label = acc_label.strip() if acc_label else '' - text = element.get('text', '').strip() - class_name = element.get('class_name', '').strip() - - conditions = [] - - if resource_id: - conditions.append(f"@resource-id='{resource_id}'") - - if acc_label: - conditions.append(f"@content-desc='{acc_label}'") - - if text: - conditions.append(f"@text='{text}'") - - if not conditions: - if class_name: - return f"//{class_name}" - raise AssertionError("Cannot build locator: element has no usable attributes") - - base = f"//{class_name}" if class_name else "//*" - return f"{base}[{' and '.join(conditions)}]" + def build_priority(self, element: Dict[str, Any]) -> str: + if self._platform == "ios": + return self._ios_builder.build_priority(element) + return self._android_builder.build_priority(element) - def build_ios(self, element: Dict[str, Any]) -> str: - """ - Build iOS predicate string combining all available attributes. - - Returns: "-ios predicate string:name == 'x' AND label == 'y'" - """ - resource_id = element.get('resource_id', '').strip() - acc_label = element.get('accessibility_label', '') or element.get('label', '') - acc_label = acc_label.strip() if acc_label else '' - text = element.get('text', '').strip() - class_name = element.get('class_name', '').strip() - - conditions = [] - - if resource_id: - conditions.append(f"name == '{resource_id}'") - - if acc_label: - escaped = acc_label.replace("'", "\\'") - conditions.append(f"label == '{escaped}'") - - if text: - escaped = text.replace("'", "\\'") - conditions.append(f"value == '{escaped}'") - - if class_name: - conditions.append(f"type == '{class_name}'") - - if not conditions: - raise AssertionError("Cannot build locator: element has no usable attributes") - - return f"-ios predicate string:{' AND '.join(conditions)}" + def build_robust(self, element: Dict[str, Any]) -> str: + if self._platform == "ios": + return self._ios_builder.build_robust(element) + return self._android_builder.build_robust(element) From c934b4b4d1683852020b6943287d8fb2c7ee8013 Mon Sep 17 00:00:00 2001 From: hassineabd Date: Mon, 29 Dec 2025 15:03:00 +0100 Subject: [PATCH 04/19] adapting connector filtering and collecting element changes --- Agent/platforms/__init__.py | 10 +++++--- Agent/platforms/_mobileconnector.py | 40 +++++++++++++++++++++-------- 2 files changed, 35 insertions(+), 15 deletions(-) diff --git a/Agent/platforms/__init__.py b/Agent/platforms/__init__.py index 25a24a8..6521cc0 100644 --- a/Agent/platforms/__init__.py +++ b/Agent/platforms/__init__.py @@ -1,9 +1,8 @@ from Agent.platforms._mobileconnector import DeviceConnector from Agent.platforms._platformfactory import create_platform -from Agent.platforms.locators import MobileLocatorBuilder -from Agent.platforms.collectors import XMLCollector +from Agent.platforms.locators import MobileLocatorBuilder, AndroidLocatorBuilder, IOSLocatorBuilder +from Agent.platforms.collectors import AndroidCollector, IOSCollector -# Placeholder for future web support WebConnectorRF = None __all__ = [ @@ -11,5 +10,8 @@ "WebConnectorRF", "create_platform", "MobileLocatorBuilder", - "XMLCollector", + "AndroidLocatorBuilder", + "IOSLocatorBuilder", + "AndroidCollector", + "IOSCollector", ] diff --git a/Agent/platforms/_mobileconnector.py b/Agent/platforms/_mobileconnector.py index bc541a7..439b0f4 100644 --- a/Agent/platforms/_mobileconnector.py +++ b/Agent/platforms/_mobileconnector.py @@ -1,8 +1,10 @@ from typing import Any, Dict, List from robot.api import logger from robot.libraries.BuiltIn import BuiltIn -from Agent.platforms.collectors.xml_collector import XMLCollector +from Agent.platforms.collectors.android_collector import AndroidCollector +from Agent.platforms.collectors.ios_collector import IOSCollector from Agent.platforms.locators.mobile import MobileLocatorBuilder +from Agent.platforms.filters.android import AndroidFilterPipeline from Agent.ai.prompts.renderer import UIRenderer @@ -13,12 +15,12 @@ def __init__(self): self._appium_lib = None self._driver = None self._session_id = None - self._collector = XMLCollector() + self._android_collector = AndroidCollector() + self._ios_collector = IOSCollector() self.locator_builder = MobileLocatorBuilder() self._renderer = UIRenderer() def _get_driver(self) -> Any: - """Get Appium driver instance.""" if self._appium_lib is None: self._appium_lib = BuiltIn().get_library_instance('AppiumLibrary') @@ -54,29 +56,45 @@ def _get_driver(self) -> Any: return self._driver def get_platform(self) -> str: - """Detect platform from driver capabilities.""" caps = self._get_driver().capabilities platform = caps.get('platformName', '').lower() return 'ios' if 'ios' in platform else 'android' + def get_screen_size(self) -> Dict[str, int]: + size = self._get_driver().get_window_size() + return {'width': size.get('width', 0), 'height': size.get('height', 0)} + def get_ui_xml(self) -> str: return self._get_driver().page_source def collect_ui_candidates(self, max_items: int = 50) -> List[Dict[str, Any]]: - """Collect interactive UI elements from current screen.""" xml = self.get_ui_xml() platform = self.get_platform() - self._collector.set_platform(platform) - return self._collector.parse_xml(xml, max_items=max_items) + + if platform == 'ios': + raise NotImplementedError("iOS not implemented yet") + + elements = self._android_collector.parse_xml(xml) + pipeline = AndroidFilterPipeline(self.get_screen_size()) + filtered = pipeline.apply(elements) + + return filtered[:max_items] + + def collect_all_elements(self) -> List[Dict[str, Any]]: + xml = self.get_ui_xml() + platform = self.get_platform() + + if platform == 'ios': + raise NotImplementedError("iOS not implemented yet") + + return self._android_collector.parse_xml(xml) - def build_locator_from_element(self, element: Dict[str, Any]) -> str: - """Build Appium locator from element attributes.""" + def build_locator_from_element(self, element: Dict[str, Any], robust: bool = False) -> str: platform = self.get_platform() self.locator_builder.set_platform(platform) - return self.locator_builder.build(element) + return self.locator_builder.build(element, robust=robust) def render_ui_for_prompt(self, ui_elements: List[Dict[str, Any]]) -> str: - """Render UI elements as text for AI prompt.""" platform = self.get_platform() return self._renderer.render(ui_elements, platform=platform) From f60dd23ad7c706fb4d02510b1240a7b05d0e607d Mon Sep 17 00:00:00 2001 From: hassineabd Date: Mon, 29 Dec 2025 19:41:46 +0100 Subject: [PATCH 05/19] fix small element annotation --- Agent/platforms/collectors/som_renderer.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/Agent/platforms/collectors/som_renderer.py b/Agent/platforms/collectors/som_renderer.py index ac7a33a..702b471 100644 --- a/Agent/platforms/collectors/som_renderer.py +++ b/Agent/platforms/collectors/som_renderer.py @@ -67,13 +67,25 @@ def render_som( source = element.get(source_key, "dom") color = COLOR_DOM if source == "dom" else COLOR_OMNIPARSER if source == "omniparser" else COLOR_DEFAULT - # Apply margin to create visual spacing between boxes + # Apply margin to create visual spacing, but ensure box remains valid margin = 4 + # Adjust margin if element is too small + if w <= 2 * margin: + margin = max(0, w // 2 - 1) + if h <= 2 * margin: + margin = min(margin, max(0, h // 2 - 1)) + box_x1 = x + margin box_y1 = y + margin box_x2 = x + w - margin box_y2 = y + h - margin + # Ensure valid coordinates (x2 > x1, y2 > y1) + if box_x2 <= box_x1: + box_x2 = box_x1 + 1 + if box_y2 <= box_y1: + box_y2 = box_y1 + 1 + # Draw box with transparency draw.rectangle( [box_x1, box_y1, box_x2, box_y2], From 06670c922eb2247162847c025087880a98137e2d Mon Sep 17 00:00:00 2001 From: hassineabd Date: Mon, 29 Dec 2025 23:15:27 +0100 Subject: [PATCH 06/19] android locator builder --- Agent/platforms/locators/android_locator.py | 111 +++++++++++++++----- 1 file changed, 87 insertions(+), 24 deletions(-) diff --git a/Agent/platforms/locators/android_locator.py b/Agent/platforms/locators/android_locator.py index c034bc6..3f5ea53 100644 --- a/Agent/platforms/locators/android_locator.py +++ b/Agent/platforms/locators/android_locator.py @@ -4,17 +4,17 @@ class AndroidLocatorBuilder: """Builds Appium locators for Android elements.""" - def build(self, element: Dict[str, Any], robust: bool = False) -> str: - if robust: - return self.build_robust(element) - return self.build_priority(element) + def build(self, element: Dict[str, Any], id_only: bool = False) -> str: + if id_only: + return self.build_identifiers_only(element) + return self.build_locator_unique_content(element) - def build_priority(self, element: Dict[str, Any]) -> str: + def build_locator_unique_content(self, element: Dict[str, Any]) -> str: """ Args: element: Dict with raw XML attributes Returns: - First available locator: id > accessibility_id > text > class + Unique content: resource-id > content-desc > text """ resource_id = self._get_str(element, 'resource-id') if resource_id: @@ -28,44 +28,107 @@ def build_priority(self, element: Dict[str, Any]) -> str: if text: return f"//*[@text={self._escape_xpath(text)}]" - class_name = self._get_str(element, 'class') - if class_name: - return f"class={class_name}" - raise AssertionError("Cannot build locator: no usable attributes") - def build_robust(self, element: Dict[str, Any]) -> str: + def build_identifiers_only(self, element: Dict[str, Any]) -> str: """ Args: element: Dict with raw XML attributes Returns: - XPath combining all available attributes for uniqueness + Identifiers only: resource-id > content-desc, raise if none + Example: 'id=com.android:id/button' or 'accessibility_id=Navigate up' """ - conditions = [] + content_desc = self._get_str(element, 'content-desc') + if content_desc: + return f"accessibility_id={content_desc}" resource_id = self._get_str(element, 'resource-id') if resource_id: - conditions.append(f"@resource-id={self._escape_xpath(resource_id)}") + return f"id={resource_id}" - content_desc = self._get_str(element, 'content-desc') - if content_desc: - conditions.append(f"@content-desc={self._escape_xpath(content_desc)}") + raise ValueError("No ID attributes available") + + def build_by_bounds(self, element: Dict[str, Any]) -> str: + """ + Args: + element: Dict with raw XML attributes + Returns: + XPath with bounds attribute + Example: '//*[@bounds="[0,72][1080,200]"]' + """ + bounds = self._get_str(element, 'bounds') + if not bounds: + raise ValueError("No bounds attribute available") - text = self._get_str(element, 'text') - if text: - conditions.append(f"@text={self._escape_xpath(text)}") + class_name = self._get_str(element, 'class') + base = f"//{class_name}" if class_name else "//*" - bounds = self._get_str(element, 'bounds') - if bounds: - conditions.append(f"@bounds='{bounds}'") + return f"{base}[@bounds='{bounds}']" + + def build_xpath_attributes(self, element: Dict[str, Any]) -> str: + """ + Args: + element: Dict with raw XML attributes + Returns: + XPath with content attributes (resource-id, content-desc, text) + Example: '//Button[@resource-id="btn" and @text="Login"]' + """ + return self._build_full_xpath(element, exclude_metadata=True) + + def build_xpath_all(self, element: Dict[str, Any]) -> str: + """ + Args: + element: Dict with raw XML attributes + Returns: + XPath with ALL attributes including metadata (clickable, enabled, etc.) + Example: '//Button[@resource-id="btn" and @clickable="true"]' + """ + return self._build_full_xpath(element, exclude_metadata=False) + + def _build_full_xpath( + self, + element: Dict[str, Any], + exclude_metadata: bool = True + ) -> str: + """ + Args: + element: Dict with raw XML attributes + exclude_metadata: If True, exclude only computed (bbox, elementId, package) + If False, also exclude bool/numeric values (except bounds) + Returns: + XPath combining selected attributes dynamically + """ + excluded_base = {'bbox', 'elementId', 'package'} + conditions = [] class_name = self._get_str(element, 'class') + + for key, value in element.items(): + if key == 'class': + continue + + if key in excluded_base: + continue + + val_str = str(value).strip() if value else '' + if not val_str: + continue + + if not exclude_metadata: + if key != 'bounds': + if val_str in ('true', 'false'): + continue + if val_str.isdigit(): + continue + + conditions.append(f"@{key}={self._escape_xpath(val_str)}") + base = f"//{class_name}" if class_name else "//*" if not conditions: if class_name: return base - raise AssertionError("Cannot build locator: no usable attributes") + raise ValueError("No attributes available") return f"{base}[{' and '.join(conditions)}]" From e221e3fccd0f1482120d07cb2c9a68233463c3f3 Mon Sep 17 00:00:00 2001 From: hassineabd Date: Tue, 30 Dec 2025 00:39:45 +0100 Subject: [PATCH 07/19] added global building locator with strategy type --- Agent/platforms/locators/android_locator.py | 26 ++++++++++---- Agent/platforms/locators/ios_locator.py | 13 +++++-- Agent/platforms/locators/mobile.py | 30 ---------------- Agent/platforms/locators/mobile_locator.py | 38 +++++++++++++++++++++ 4 files changed, 67 insertions(+), 40 deletions(-) delete mode 100644 Agent/platforms/locators/mobile.py create mode 100644 Agent/platforms/locators/mobile_locator.py diff --git a/Agent/platforms/locators/android_locator.py b/Agent/platforms/locators/android_locator.py index 3f5ea53..6affa63 100644 --- a/Agent/platforms/locators/android_locator.py +++ b/Agent/platforms/locators/android_locator.py @@ -4,18 +4,29 @@ class AndroidLocatorBuilder: """Builds Appium locators for Android elements.""" - def build(self, element: Dict[str, Any], id_only: bool = False) -> str: - if id_only: - return self.build_identifiers_only(element) - return self.build_locator_unique_content(element) - - def build_locator_unique_content(self, element: Dict[str, Any]) -> str: + def build(self, element: Dict[str, Any], strategy: str = 'auto') -> str: """ Args: element: Dict with raw XML attributes + strategy: 'auto' | 'id_only' | 'bounds' | 'xpath_attrs' | 'xpath_all' Returns: - Unique content: resource-id > content-desc > text + Appium locator string + Example: build(elem, 'id_only') -> 'id=com.android:id/button' """ + if strategy == 'auto': + return self._build_locator_unique_content(element) + elif strategy == 'id_only': + return self.build_identifiers_only(element) + elif strategy == 'bounds': + return self.build_by_bounds(element) + elif strategy == 'xpath_attrs': + return self.build_xpath_attributes(element) + elif strategy == 'xpath_all': + return self.build_xpath_all(element) + else: + raise ValueError(f"Unknown strategy: {strategy}") + + def _build_locator_unique_content(self, element: Dict[str, Any]) -> str: resource_id = self._get_str(element, 'resource-id') if resource_id: return f"id={resource_id}" @@ -30,6 +41,7 @@ def build_locator_unique_content(self, element: Dict[str, Any]) -> str: raise AssertionError("Cannot build locator: no usable attributes") + #TODO: see if this should be private after adding locator strategies ( build ) def build_identifiers_only(self, element: Dict[str, Any]) -> str: """ Args: diff --git a/Agent/platforms/locators/ios_locator.py b/Agent/platforms/locators/ios_locator.py index 9c97f37..307940b 100644 --- a/Agent/platforms/locators/ios_locator.py +++ b/Agent/platforms/locators/ios_locator.py @@ -4,12 +4,19 @@ class IOSLocatorBuilder: """Builds Appium locators for iOS elements.""" - def build(self, element: Dict[str, Any], robust: bool = False) -> str: + def build(self, element: Dict[str, Any], strategy: str = 'auto') -> str: raise NotImplementedError("iOS locator builder not implemented yet") - def build_priority(self, element: Dict[str, Any]) -> str: + #TODO: see if this should be private after adding locator strategies ( build ) + def build_identifiers_only(self, element: Dict[str, Any]) -> str: raise NotImplementedError("iOS locator builder not implemented yet") - def build_robust(self, element: Dict[str, Any]) -> str: + def build_by_bounds(self, element: Dict[str, Any]) -> str: + raise NotImplementedError("iOS locator builder not implemented yet") + + def build_xpath_attributes(self, element: Dict[str, Any]) -> str: + raise NotImplementedError("iOS locator builder not implemented yet") + + def build_xpath_all(self, element: Dict[str, Any]) -> str: raise NotImplementedError("iOS locator builder not implemented yet") diff --git a/Agent/platforms/locators/mobile.py b/Agent/platforms/locators/mobile.py deleted file mode 100644 index c391849..0000000 --- a/Agent/platforms/locators/mobile.py +++ /dev/null @@ -1,30 +0,0 @@ -from typing import Any, Dict -from Agent.platforms.locators.android_locator import AndroidLocatorBuilder -from Agent.platforms.locators.ios_locator import IOSLocatorBuilder - - -class MobileLocatorBuilder: - """Facade that dispatches to platform-specific locator builders.""" - - def __init__(self, platform: str = "android"): - self._platform = platform - self._android_builder = AndroidLocatorBuilder() - self._ios_builder = IOSLocatorBuilder() - - def set_platform(self, platform: str) -> None: - self._platform = platform - - def build(self, element: Dict[str, Any], robust: bool = False) -> str: - if self._platform == "ios": - return self._ios_builder.build(element, robust=robust) - return self._android_builder.build(element, robust=robust) - - def build_priority(self, element: Dict[str, Any]) -> str: - if self._platform == "ios": - return self._ios_builder.build_priority(element) - return self._android_builder.build_priority(element) - - def build_robust(self, element: Dict[str, Any]) -> str: - if self._platform == "ios": - return self._ios_builder.build_robust(element) - return self._android_builder.build_robust(element) diff --git a/Agent/platforms/locators/mobile_locator.py b/Agent/platforms/locators/mobile_locator.py new file mode 100644 index 0000000..9a08c5b --- /dev/null +++ b/Agent/platforms/locators/mobile_locator.py @@ -0,0 +1,38 @@ +from typing import Any, Dict, Literal + +StrategyType = Literal['auto', 'id_only', 'bounds', 'xpath_attrs', 'xpath_all'] + + +class MobileLocatorBuilder: + """Facade that dispatches to platform-specific locator builders with lazy init and flexible platform.""" + + def __init__(self, platform: str = None): + self._platform = platform + self._builder = None + + def set_platform(self, platform: str): + if self._platform != platform: + self._platform = platform + self._builder = None + + def _get_builder(self): + if self._builder is None: + if self._platform == 'ios': + from Agent.platforms.locators.ios_locator import IOSLocatorBuilder + self._builder = IOSLocatorBuilder() + else: + from Agent.platforms.locators.android_locator import AndroidLocatorBuilder + self._builder = AndroidLocatorBuilder() + return self._builder + + def build(self, element: Dict[str, Any], strategy: StrategyType = 'auto') -> str: + """ + Args: + element: Dict with raw XML attributes + strategy: 'auto' | 'id_only' | 'bounds' | 'xpath_attrs' | 'xpath_all' + Returns: + Appium locator string + Example: build(elem, 'id_only') -> 'id=com.android:id/button' + """ + return self._get_builder().build(element, strategy=strategy) + From 44b8b690776dd30f48edcfba7861a0a2c517a949 Mon Sep 17 00:00:00 2001 From: hassineabd Date: Tue, 30 Dec 2025 00:40:09 +0100 Subject: [PATCH 08/19] expose single entry for locators building --- Agent/platforms/locators/__init__.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/Agent/platforms/locators/__init__.py b/Agent/platforms/locators/__init__.py index 5b8e665..985b634 100644 --- a/Agent/platforms/locators/__init__.py +++ b/Agent/platforms/locators/__init__.py @@ -1,9 +1,3 @@ -from Agent.platforms.locators.mobile import MobileLocatorBuilder -from Agent.platforms.locators.android_locator import AndroidLocatorBuilder -from Agent.platforms.locators.ios_locator import IOSLocatorBuilder +from Agent.platforms.locators.mobile_locator import MobileLocatorBuilder -__all__ = [ - "MobileLocatorBuilder", - "AndroidLocatorBuilder", - "IOSLocatorBuilder", -] +__all__ = ["MobileLocatorBuilder"] From 3ae942acfcacf2a18046640f8b32778f4d0384c6 Mon Sep 17 00:00:00 2001 From: hassineabd Date: Tue, 30 Dec 2025 00:56:59 +0100 Subject: [PATCH 09/19] added filtering methods --- Agent/platforms/filters/android/__init__.py | 10 + Agent/platforms/filters/android/container.py | 67 +++++ .../platforms/filters/android/interactive.py | 10 +- .../filters/android/smart_hierarchy.py | 228 ++++++++++++++++++ 4 files changed, 311 insertions(+), 4 deletions(-) create mode 100644 Agent/platforms/filters/android/container.py create mode 100644 Agent/platforms/filters/android/smart_hierarchy.py diff --git a/Agent/platforms/filters/android/__init__.py b/Agent/platforms/filters/android/__init__.py index 1928e9d..0239554 100644 --- a/Agent/platforms/filters/android/__init__.py +++ b/Agent/platforms/filters/android/__init__.py @@ -2,6 +2,8 @@ from Agent.platforms.filters.android.displayed import DisplayedFilter from Agent.platforms.filters.android.bounds import BoundsFilter from Agent.platforms.filters.android.interactive import InteractiveFilter +from Agent.platforms.filters.android.smart_hierarchy import SmartHierarchyFilter +from Agent.platforms.filters.android.container import ContainerFilter from Agent.platforms.filters.pipeline import FilterPipeline @@ -14,6 +16,12 @@ def __init__(self, screen_size: Dict[str, int] = None): DisplayedFilter(), BoundsFilter(screen_size.get('width', 0), screen_size.get('height', 0)), InteractiveFilter(), + SmartHierarchyFilter( + prefer_parent_when_clickable=True, + min_relevance_score=5, + overlap_threshold=0.9 + ), + ContainerFilter(), ]) def apply(self, elements: List[Dict[str, Any]]) -> List[Dict[str, Any]]: @@ -24,6 +32,8 @@ def apply(self, elements: List[Dict[str, Any]]) -> List[Dict[str, Any]]: 'DisplayedFilter', 'BoundsFilter', 'InteractiveFilter', + 'SmartHierarchyFilter', + 'ContainerFilter', 'AndroidFilterPipeline', ] diff --git a/Agent/platforms/filters/android/container.py b/Agent/platforms/filters/android/container.py new file mode 100644 index 0000000..c1c1dc6 --- /dev/null +++ b/Agent/platforms/filters/android/container.py @@ -0,0 +1,67 @@ +from typing import Any, Dict, List, Set + + +class ContainerFilter: + """Remove containers that have interactive children in the list""" + + CONTAINER_CLASSES = { + 'RecyclerView', 'ScrollView', 'HorizontalScrollView', + 'LinearLayout', 'RelativeLayout', 'FrameLayout', + 'ViewGroup', 'ViewPager', 'ConstraintLayout', 'CoordinatorLayout' + } + + def apply(self, elements: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + """ + Args: + elements: List of interactive elements + Returns: + List without containers that have children + """ + result = [] + + for elem in elements: + if not self._is_container(elem): + result.append(elem) + continue + + if not self._has_interactive_children(elem, elements): + result.append(elem) + + return result + + def _is_container(self, elem: Dict[str, Any]) -> bool: + """Check if element is a layout container""" + class_name = elem.get('class', '') + return any(c in class_name for c in self.CONTAINER_CLASSES) + + def _has_interactive_children(self, container, all_elements): + """Check if container has children in the element list""" + container_bbox = container.get('bbox', {}) + if not container_bbox: + return False + + cx = container_bbox.get('x', 0) + cy = container_bbox.get('y', 0) + cw = container_bbox.get('width', 0) + ch = container_bbox.get('height', 0) + + for other in all_elements: + if other is container: + continue + + other_bbox = other.get('bbox', {}) + if not other_bbox: + continue + + ox = other_bbox.get('x', 0) + oy = other_bbox.get('y', 0) + ow = other_bbox.get('width', 0) + oh = other_bbox.get('height', 0) + + # If other is contained in container + if (ox >= cx and oy >= cy and + ox + ow <= cx + cw and oy + oh <= cy + ch): + return True + + return False + diff --git a/Agent/platforms/filters/android/interactive.py b/Agent/platforms/filters/android/interactive.py index 291269e..c68b959 100644 --- a/Agent/platforms/filters/android/interactive.py +++ b/Agent/platforms/filters/android/interactive.py @@ -29,10 +29,12 @@ def _is_interactive(self, e: Dict[str, Any]) -> bool: if content_desc and str(content_desc).strip(): return True + resource_id = e.get('resource-id', '').strip() class_name = e.get('class', '') - for interactive_class in self.INTERACTIVE_CLASSES: - if interactive_class in class_name: - return True + + if resource_id: + for interactive_class in self.INTERACTIVE_CLASSES: + if interactive_class in class_name: + return True return False - diff --git a/Agent/platforms/filters/android/smart_hierarchy.py b/Agent/platforms/filters/android/smart_hierarchy.py new file mode 100644 index 0000000..d370be6 --- /dev/null +++ b/Agent/platforms/filters/android/smart_hierarchy.py @@ -0,0 +1,228 @@ +from typing import Any, Dict, List, Set, Optional + + +class SmartHierarchyFilter: + """ + Args: + prefer_parent_when_clickable: Keep parent if clickable, ignore children + min_relevance_score: Min score to keep element + overlap_threshold: Min overlap ratio (0-1) to group elements + Example: SmartHierarchyFilter(prefer_parent_when_clickable=True, min_relevance_score=5) + """ + + CONTAINER_CLASSES = { + 'RecyclerView', 'ScrollView', 'HorizontalScrollView', + 'LinearLayout', 'RelativeLayout', 'FrameLayout', + 'ViewGroup', 'ViewPager', 'ConstraintLayout' + } + + def __init__( + self, + prefer_parent_when_clickable: bool = True, + min_relevance_score: int = 0, + overlap_threshold: float = 0.9 + ): + self._prefer_parent_when_clickable = prefer_parent_when_clickable + self._min_relevance_score = min_relevance_score + self._overlap_threshold = overlap_threshold + + def apply(self, elements: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + """ + Args: + elements: List of elements + Returns: + List with one element per overlapping group + """ + if not elements: + return [] + + groups = self._build_overlap_groups(elements) + selected_indices = set() + + for group_indices in groups: + best_idx = self._select_best_from_group(elements, group_indices) + if best_idx is not None: + selected_indices.add(best_idx) + + return [elements[i] for i in sorted(selected_indices)] + + def _build_overlap_groups(self, elements: List[Dict[str, Any]]) -> List[List[int]]: + """ + Args: + elements: List of elements + Returns: + List of groups, each group is list of overlapping element indices + """ + n = len(elements) + visited = [False] * n + groups = [] + + for i in range(n): + if visited[i]: + continue + + elem_i = elements[i] + bbox_i = elem_i.get('bbox', {}) + if not bbox_i: + groups.append([i]) + visited[i] = True + continue + + group = [i] + visited[i] = True + + for j in range(i + 1, n): + if visited[j]: + continue + + elem_j = elements[j] + bbox_j = elem_j.get('bbox', {}) + if not bbox_j: + continue + + if self._should_group(elem_i, elem_j, bbox_i, bbox_j): + group.append(j) + visited[j] = True + + groups.append(group) + + return groups + + def _should_group( + self, + elem1: Dict[str, Any], + elem2: Dict[str, Any], + bbox1: Dict[str, int], + bbox2: Dict[str, int] + ) -> bool: + """Check if two elements should be grouped together""" + if self._is_container(elem1) or self._is_container(elem2): + return False + + if not self._is_interactive(elem1) and not self._is_interactive(elem2): + return False + + return self._has_significant_overlap(bbox1, bbox2) + + def _is_container(self, elem: Dict[str, Any]) -> bool: + """Check if element is a layout container""" + class_name = elem.get('class', '') + return any(c in class_name for c in self.CONTAINER_CLASSES) + + def _has_significant_overlap( + self, + bbox1: Dict[str, int], + bbox2: Dict[str, int] + ) -> bool: + """Check if two bboxes overlap significantly""" + x1 = bbox1.get('x', 0) + y1 = bbox1.get('y', 0) + w1 = bbox1.get('width', 0) + h1 = bbox1.get('height', 0) + + x2 = bbox2.get('x', 0) + y2 = bbox2.get('y', 0) + w2 = bbox2.get('width', 0) + h2 = bbox2.get('height', 0) + + x_left = max(x1, x2) + y_top = max(y1, y2) + x_right = min(x1 + w1, x2 + w2) + y_bottom = min(y1 + h1, y2 + h2) + + if x_right < x_left or y_bottom < y_top: + return False + + intersection = (x_right - x_left) * (y_bottom - y_top) + area1 = w1 * h1 + area2 = w2 * h2 + + if area1 == 0 or area2 == 0: + return False + + overlap_ratio = max(intersection / area1, intersection / area2) + return overlap_ratio > self._overlap_threshold + + def _select_best_from_group( + self, + elements: List[Dict[str, Any]], + group_indices: List[int] + ) -> Optional[int]: + """ + Args: + elements: All elements + group_indices: Indices of parent and children + Returns: + Index of best element to keep + """ + if not group_indices: + return None + + parent_idx = group_indices[0] + parent = elements[parent_idx] + + if self._prefer_parent_when_clickable: + if self._is_interactive(parent): + return parent_idx + + best_idx = parent_idx + best_score = self._get_relevance_score(parent) + + for idx in group_indices: + elem = elements[idx] + score = self._get_relevance_score(elem) + + if score > best_score: + best_score = score + best_idx = idx + + if best_score >= self._min_relevance_score: + return best_idx + + return None + + def _is_interactive(self, elem: Dict[str, Any]) -> bool: + """Check if element is interactive""" + return ( + elem.get('clickable') == 'true' or + elem.get('focusable') == 'true' or + elem.get('long-clickable') == 'true' + ) + + def _get_relevance_score(self, elem: Dict[str, Any]) -> int: + """ + Args: + elem: Element dict + Returns: + Relevance score (higher = more relevant) + """ + score = 0 + + if elem.get('clickable') == 'true': + score += 20 + if elem.get('focusable') == 'true': + score += 15 + + class_name = elem.get('class', '') + if 'Button' in class_name or 'EditText' in class_name or 'ImageButton' in class_name: + score += 10 + + if elem.get('text', '').strip(): + score += 8 + if elem.get('content-desc', '').strip(): + score += 6 + if elem.get('resource-id', '').strip(): + score += 4 + + if 'Layout' in class_name or 'ViewGroup' in class_name or 'FrameLayout' in class_name: + score -= 10 + + bbox = elem.get('bbox', {}) + if bbox: + width = bbox.get('width', 0) + height = bbox.get('height', 0) + if width < 10 or height < 10: + score -= 5 + + return score + From 3bda9b26aee9ac90f389fa37e96127e69eef2f4d Mon Sep 17 00:00:00 2001 From: hassineabd Date: Tue, 30 Dec 2025 00:57:44 +0100 Subject: [PATCH 10/19] building locator from element --- Agent/platforms/__init__.py | 4 +- Agent/platforms/_mobileconnector.py | 101 ++++++++++++++++++++-------- 2 files changed, 73 insertions(+), 32 deletions(-) diff --git a/Agent/platforms/__init__.py b/Agent/platforms/__init__.py index 6521cc0..264ae37 100644 --- a/Agent/platforms/__init__.py +++ b/Agent/platforms/__init__.py @@ -1,6 +1,6 @@ from Agent.platforms._mobileconnector import DeviceConnector from Agent.platforms._platformfactory import create_platform -from Agent.platforms.locators import MobileLocatorBuilder, AndroidLocatorBuilder, IOSLocatorBuilder +from Agent.platforms.locators import MobileLocatorBuilder from Agent.platforms.collectors import AndroidCollector, IOSCollector WebConnectorRF = None @@ -10,8 +10,6 @@ "WebConnectorRF", "create_platform", "MobileLocatorBuilder", - "AndroidLocatorBuilder", - "IOSLocatorBuilder", "AndroidCollector", "IOSCollector", ] diff --git a/Agent/platforms/_mobileconnector.py b/Agent/platforms/_mobileconnector.py index 439b0f4..c81fee3 100644 --- a/Agent/platforms/_mobileconnector.py +++ b/Agent/platforms/_mobileconnector.py @@ -1,11 +1,7 @@ from typing import Any, Dict, List from robot.api import logger from robot.libraries.BuiltIn import BuiltIn -from Agent.platforms.collectors.android_collector import AndroidCollector -from Agent.platforms.collectors.ios_collector import IOSCollector -from Agent.platforms.locators.mobile import MobileLocatorBuilder -from Agent.platforms.filters.android import AndroidFilterPipeline -from Agent.ai.prompts.renderer import UIRenderer +# Lazy import in methods for collectors, renderer, and locator builder class DeviceConnector: @@ -15,10 +11,11 @@ def __init__(self): self._appium_lib = None self._driver = None self._session_id = None - self._android_collector = AndroidCollector() - self._ios_collector = IOSCollector() - self.locator_builder = MobileLocatorBuilder() - self._renderer = UIRenderer() + self._platform = None + self._collector = None # Lazy init + self._filter_pipeline = None # Lazy init + self._locator_builder = None # Lazy init + self._renderer = None # Lazy init def _get_driver(self) -> Any: if self._appium_lib is None: @@ -56,9 +53,11 @@ def _get_driver(self) -> Any: return self._driver def get_platform(self) -> str: - caps = self._get_driver().capabilities - platform = caps.get('platformName', '').lower() - return 'ios' if 'ios' in platform else 'android' + if self._platform is None: + caps = self._get_driver().capabilities + platform = caps.get('platformName', '').lower() + self._platform = 'ios' if 'ios' in platform else 'android' + return self._platform def get_screen_size(self) -> Dict[str, int]: size = self._get_driver().get_window_size() @@ -69,34 +68,43 @@ def get_ui_xml(self) -> str: def collect_ui_candidates(self, max_items: int = 50) -> List[Dict[str, Any]]: xml = self.get_ui_xml() - platform = self.get_platform() - - if platform == 'ios': - raise NotImplementedError("iOS not implemented yet") + collector = self._get_collector() + pipeline = self._get_filter_pipeline() - elements = self._android_collector.parse_xml(xml) - pipeline = AndroidFilterPipeline(self.get_screen_size()) + elements = collector.parse_xml(xml) filtered = pipeline.apply(elements) + filtered.sort( + key=lambda e: ( + bool(e.get('resource-id', '').strip()), + bool(e.get('content-desc', '').strip()), + bool(e.get('text', '').strip()), + e.get('clickable') == 'true', + ), + reverse=True + ) + return filtered[:max_items] def collect_all_elements(self) -> List[Dict[str, Any]]: xml = self.get_ui_xml() - platform = self.get_platform() - - if platform == 'ios': - raise NotImplementedError("iOS not implemented yet") - - return self._android_collector.parse_xml(xml) + collector = self._get_collector() + return collector.parse_xml(xml) - def build_locator_from_element(self, element: Dict[str, Any], robust: bool = False) -> str: - platform = self.get_platform() - self.locator_builder.set_platform(platform) - return self.locator_builder.build(element, robust=robust) + def build_locator_from_element(self, element: Dict[str, Any], strategy: str = 'auto') -> str: + """ + Args: + element: Dict with raw XML attributes + strategy: 'auto' | 'id_only' | 'bounds' | 'xpath_attrs' | 'xpath_all' + Returns: + Appium locator string + Example: build_locator_from_element(elem, 'id_only') -> 'id=com.android:id/button' + """ + return self._get_locator_builder().build(element, strategy=strategy) def render_ui_for_prompt(self, ui_elements: List[Dict[str, Any]]) -> str: platform = self.get_platform() - return self._renderer.render(ui_elements, platform=platform) + return self._get_renderer().render(ui_elements, platform=platform) def get_screenshot_base64(self) -> str: return self._get_driver().get_screenshot_as_base64() @@ -108,3 +116,38 @@ def embed_image_to_log(self, base64_screenshot: str, width: int = 400) -> None: def wait_for_page_stable(self, delay: float = 1.0) -> None: import time time.sleep(delay) + + def _get_locator_builder(self): + if self._locator_builder is None: + from Agent.platforms.locators import MobileLocatorBuilder + platform = self.get_platform() + self._locator_builder = MobileLocatorBuilder(platform=platform) + return self._locator_builder + + def _get_collector(self): + if self._collector is None: + platform = self.get_platform() + if platform == 'ios': + from Agent.platforms.collectors import IOSCollector + self._collector = IOSCollector() + else: + from Agent.platforms.collectors import AndroidCollector + self._collector = AndroidCollector() + return self._collector + + def _get_filter_pipeline(self): + if self._filter_pipeline is None: + platform = self.get_platform() + if platform == 'ios': + from Agent.platforms.filters.pipeline import FilterPipeline + self._filter_pipeline = FilterPipeline() + else: + from Agent.platforms.filters.android import AndroidFilterPipeline + self._filter_pipeline = AndroidFilterPipeline() + return self._filter_pipeline + + def _get_renderer(self): + if self._renderer is None: + from Agent.platforms.collectors import render_som + self._renderer = type('Renderer', (), {'render': lambda self, elements, platform: render_som('', elements)})() + return self._renderer From b0cfac5d814987cbd19f3ecd64c7e40b73cdea1d Mon Sep 17 00:00:00 2001 From: hassineabd Date: Tue, 30 Dec 2025 02:38:32 +0100 Subject: [PATCH 11/19] handle small elements annotation and annotate on linux and windows --- Agent/platforms/collectors/som_renderer.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/Agent/platforms/collectors/som_renderer.py b/Agent/platforms/collectors/som_renderer.py index 702b471..8d0e3e6 100644 --- a/Agent/platforms/collectors/som_renderer.py +++ b/Agent/platforms/collectors/som_renderer.py @@ -44,10 +44,19 @@ def render_som( draw = ImageDraw.Draw(overlay) try: - #TODO: fix this for windows and linux ( pixelized font on those OS ) - font = ImageFont.truetype("/System/Library/Fonts/Helvetica.ttc", 14) - font_large = ImageFont.truetype("/System/Library/Fonts/Helvetica.ttc", 24) - except: + import platform + system = platform.system() + + if system == "Darwin": + font = ImageFont.truetype("/System/Library/Fonts/Helvetica.ttc", 14) + font_large = ImageFont.truetype("/System/Library/Fonts/Helvetica.ttc", 24) + elif system == "Windows": + font = ImageFont.truetype("arial.ttf", 14) + font_large = ImageFont.truetype("arial.ttf", 24) + else: + font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 14) + font_large = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 24) + except Exception: font = ImageFont.load_default() font_large = font From e68be3c27986b46692dbe12178796c8526f869ee Mon Sep 17 00:00:00 2001 From: hassineabd Date: Tue, 30 Dec 2025 12:17:20 +0100 Subject: [PATCH 12/19] improved grounding with adding json items --- Agent/platforms/grounding/__init__.py | 12 ++ Agent/platforms/grounding/composer.py | 63 ++++++++ Agent/platforms/grounding/som/__init__.py | 5 + Agent/platforms/grounding/som/annotator.py | 130 ++++++++++++++++ Agent/platforms/grounding/som/serializer.py | 148 +++++++++++++++++++ Agent/platforms/grounding/text/__init__.py | 4 + Agent/platforms/grounding/text/serializer.py | 109 ++++++++++++++ 7 files changed, 471 insertions(+) create mode 100644 Agent/platforms/grounding/__init__.py create mode 100644 Agent/platforms/grounding/composer.py create mode 100644 Agent/platforms/grounding/som/__init__.py create mode 100644 Agent/platforms/grounding/som/annotator.py create mode 100644 Agent/platforms/grounding/som/serializer.py create mode 100644 Agent/platforms/grounding/text/__init__.py create mode 100644 Agent/platforms/grounding/text/serializer.py diff --git a/Agent/platforms/grounding/__init__.py b/Agent/platforms/grounding/__init__.py new file mode 100644 index 0000000..f946a42 --- /dev/null +++ b/Agent/platforms/grounding/__init__.py @@ -0,0 +1,12 @@ +from Agent.platforms.grounding.som.annotator import annotate_screenshot +from Agent.platforms.grounding.som.serializer import SomSerializer +from Agent.platforms.grounding.text.serializer import TextSerializer +from Agent.platforms.grounding.composer import SomComposer + +__all__ = [ + 'annotate_screenshot', + 'SomSerializer', + 'TextSerializer', + 'SomComposer', +] + diff --git a/Agent/platforms/grounding/composer.py b/Agent/platforms/grounding/composer.py new file mode 100644 index 0000000..99b5fe7 --- /dev/null +++ b/Agent/platforms/grounding/composer.py @@ -0,0 +1,63 @@ +from typing import Any, Dict, List, Optional +from Agent.platforms.grounding.som.serializer import SomSerializer +from Agent.platforms.grounding.som.annotator import annotate_screenshot + + +class SomComposer: + """ + Orchestrates SoM components (visual annotation + text legend). + + Args: + platform: "android" (implemented), "ios"/"web" (future) + screen_width: Screen width in pixels + screen_height: Screen height in pixels + """ + + def __init__( + self, + platform: str = "android", + screen_width: int = 1080, + screen_height: int = 1920 + ): + self.platform = platform + self.serializer = SomSerializer(platform, screen_width, screen_height) + + def compose( + self, + screenshot_base64: Optional[str], + elements: List[Dict[str, Any]], + config: Optional[Dict[str, Any]] = None + ) -> Dict[str, Any]: + """ + Args: + screenshot_base64: Base64 screenshot (required if visual_annotation=True) + elements: List of UI elements + config: {visual_annotation, text_format, output_type, include_screenshot} + Returns: + {annotated_image_base64, elements_json, text_legend} + """ + if config is None: + config = {} + + visual_annotation = config.get('visual_annotation', True) + text_format = config.get('text_format', 'compact') + output_type = config.get('output_type', 'text') + include_screenshot = config.get('include_screenshot', True) + + result = {} + + if visual_annotation: + if not screenshot_base64: + raise ValueError("screenshot_base64 required when visual_annotation=True") + + annotated_image = annotate_screenshot(screenshot_base64, elements) + if include_screenshot: + result['annotated_image_base64'] = annotated_image + + if output_type == 'json': + result['elements_json'] = self.serializer.serialize(elements, output_type='json') + elif output_type == 'text': + result['text_legend'] = self.serializer.serialize(elements, format=text_format, output_type='text') + + return result + diff --git a/Agent/platforms/grounding/som/__init__.py b/Agent/platforms/grounding/som/__init__.py new file mode 100644 index 0000000..ab5575b --- /dev/null +++ b/Agent/platforms/grounding/som/__init__.py @@ -0,0 +1,5 @@ +from Agent.platforms.grounding.som.annotator import annotate_screenshot +from Agent.platforms.grounding.som.serializer import SomSerializer + +__all__ = ['annotate_screenshot', 'SomSerializer'] + diff --git a/Agent/platforms/grounding/som/annotator.py b/Agent/platforms/grounding/som/annotator.py new file mode 100644 index 0000000..54aafba --- /dev/null +++ b/Agent/platforms/grounding/som/annotator.py @@ -0,0 +1,130 @@ +""" +SoM Visual Annotator. + +Draws numbered bounding boxes on screenshots for visual grounding. +""" + +import base64 +import io +from typing import Any, Dict, List, Tuple + +from PIL import Image, ImageDraw, ImageFont + + +COLOR_DOM = (34, 197, 94) +COLOR_OMNIPARSER = (249, 115, 22) +COLOR_DEFAULT = (59, 130, 246) + + +def annotate_screenshot( + screenshot_base64: str, + elements: List[Dict[str, Any]], + source_key: str = "source", +) -> str: + """ + Args: + screenshot_base64: Base64 encoded PNG/JPEG + elements: List with 'bbox' key {x, y, width, height} + source_key: Key to check for source type + Returns: + Base64 of annotated image + """ + img_bytes = base64.b64decode(screenshot_base64) + img = Image.open(io.BytesIO(img_bytes)).convert("RGBA") + + overlay = Image.new("RGBA", img.size, (0, 0, 0, 0)) + draw = ImageDraw.Draw(overlay) + + try: + import platform + system = platform.system() + + if system == "Darwin": + font = ImageFont.truetype("/System/Library/Fonts/Helvetica.ttc", 14) + font_large = ImageFont.truetype("/System/Library/Fonts/Helvetica.ttc", 24) + elif system == "Windows": + font = ImageFont.truetype("arial.ttf", 14) + font_large = ImageFont.truetype("arial.ttf", 24) + else: + font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 14) + font_large = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 24) + except Exception: + font = ImageFont.load_default() + font_large = font + + for idx, element in enumerate(elements, start=1): + bbox = element.get("bbox") + if not bbox: + continue + + x = bbox.get("x", 0) + y = bbox.get("y", 0) + w = bbox.get("width", 0) + h = bbox.get("height", 0) + + if w <= 0 or h <= 0: + continue + + source = element.get(source_key, "dom") + color = COLOR_DOM if source == "dom" else COLOR_OMNIPARSER if source == "omniparser" else COLOR_DEFAULT + + margin = 4 + if w <= 2 * margin: + margin = max(0, w // 2 - 1) + if h <= 2 * margin: + margin = min(margin, max(0, h // 2 - 1)) + + box_x1 = x + margin + box_y1 = y + margin + box_x2 = x + w - margin + box_y2 = y + h - margin + + if box_x2 <= box_x1: + box_x2 = box_x1 + 1 + if box_y2 <= box_y1: + box_y2 = box_y1 + 1 + + draw.rectangle( + [box_x1, box_y1, box_x2, box_y2], + outline=color + (255,), + width=2 + ) + + label = str(idx) + label_bbox = draw.textbbox((0, 0), label, font=font_large) + label_w = label_bbox[2] - label_bbox[0] + 12 + label_h = label_bbox[3] - label_bbox[1] + 8 + + label_x = box_x1 + 5 + label_y = box_y1 + 5 + + draw.rectangle( + [label_x, label_y, label_x + label_w, label_y + label_h], + fill=color + (230,) + ) + + draw.text( + (label_x + 6, label_y + 4), + label, + fill=(255, 255, 255, 255), + font=font_large, + stroke_width=2, + stroke_fill=(0, 0, 0, 255) + ) + + result = Image.alpha_composite(img, overlay).convert("RGB") + + buffer = io.BytesIO() + result.save(buffer, format="PNG") + return base64.b64encode(buffer.getvalue()).decode("utf-8") + + +def bbox_center(bbox: Dict[str, int]) -> Tuple[int, int]: + if not bbox: + return (0, 0) + x = bbox.get("x", 0) + y = bbox.get("y", 0) + w = bbox.get("width", 0) + h = bbox.get("height", 0) + return (x + w // 2, y + h // 2) + diff --git a/Agent/platforms/grounding/som/serializer.py b/Agent/platforms/grounding/som/serializer.py new file mode 100644 index 0000000..d0557b5 --- /dev/null +++ b/Agent/platforms/grounding/som/serializer.py @@ -0,0 +1,148 @@ +from typing import Any, Dict, List +import json + + +class SomSerializer: + """ + Serializes UI elements as text or JSON for SoM prompts. + + Args: + platform: "android" (implemented), "ios"/"web" (future) + screen_width: Screen width in pixels + screen_height: Screen height in pixels + """ + + def __init__(self, platform: str = "android", screen_width: int = 1080, screen_height: int = 1920): + self.platform = platform + self.screen_width = screen_width + self.screen_height = screen_height + + def serialize( + self, + elements: List[Dict[str, Any]], + format: str = "compact", + output_type: str = "text" + ) -> str: + """ + Args: + elements: List of UI elements + format: "compact", "detailed", "minimal" (for text mode) + output_type: "text" or "json" + Returns: + Formatted string + """ + if output_type == "json": + return self._to_json(elements) + else: + return self._to_text(elements, format) + + def _to_text(self, elements: List[Dict[str, Any]], format: str) -> str: + if not elements: + return "(no elements)" + + lines = [] + for idx, elem in enumerate(elements, start=1): + if format == "minimal": + line = self._minimal(idx, elem) + elif format == "detailed": + line = self._detailed(idx, elem) + else: + line = self._compact(idx, elem) + lines.append(line) + + return "\n".join(lines) + + def _minimal(self, idx: int, elem: Dict[str, Any]) -> str: + text = self._get_text(elem) + return f"[{idx}] {text}" if text else f"[{idx}] (no text)" + + def _compact(self, idx: int, elem: Dict[str, Any]) -> str: + class_name = elem.get("class_name") or elem.get("class", "") + short_class = class_name.split('.')[-1] if '.' in class_name else class_name + text = self._get_text(elem) + position = self._get_position(elem.get("bbox", {})) + return f"[{idx}] {short_class}: {text} @{position}" + + def _detailed(self, idx: int, elem: Dict[str, Any]) -> str: + parts = [] + class_name = elem.get("class_name") or elem.get("class", "") + short_class = class_name.split('.')[-1] if '.' in class_name else class_name + parts.append(f"[{idx}] {short_class}") + + text = self._get_text(elem) + if text: + parts.append(f"text='{text}'") + + resource_id = self._get_resource_id(elem) + if resource_id: + parts.append(f"id='{resource_id}'") + + content_desc = self._get_content_desc(elem) + if content_desc: + parts.append(f"desc='{content_desc}'") + + bbox = elem.get("bbox", {}) + if bbox: + position = self._get_position(bbox) + w = bbox.get("width", 0) + h = bbox.get("height", 0) + parts.append(f"pos={position} size={w}x{h}") + + return " | ".join(parts) + + def _to_json(self, elements: List[Dict[str, Any]]) -> str: + if self.platform != "android": + raise NotImplementedError(f"JSON not implemented for: {self.platform}") + + boxes = [] + for idx, elem in enumerate(elements, start=1): + bbox_norm = elem.get("bbox_normalized", {}) + if not bbox_norm: + bbox_norm = self._normalize(elem.get("bbox", {})) + + boxes.append({ + "mark_id": idx, + "class_name": elem.get("class_name") or elem.get("class", ""), + "text": self._get_text(elem) or "", + "resource_id": self._get_resource_id(elem) or "", + "content_desc": self._get_content_desc(elem) or "", + "bbox": bbox_norm + }) + + return json.dumps({ + "screen": {"width": self.screen_width, "height": self.screen_height}, + "som_version": "1.0", + "boxes": boxes + }, indent=2, ensure_ascii=False) + + def _normalize(self, bbox: Dict[str, int]) -> Dict[str, float]: + if not bbox or self.screen_width <= 0 or self.screen_height <= 0: + return {} + return { + 'x': round(bbox.get('x', 0) / self.screen_width, 4), + 'y': round(bbox.get('y', 0) / self.screen_height, 4), + 'width': round(bbox.get('width', 0) / self.screen_width, 4), + 'height': round(bbox.get('height', 0) / self.screen_height, 4), + } + + def _get_text(self, elem: Dict[str, Any]) -> str: + text = elem.get("text", "") + if isinstance(text, str): + return text.replace("\n", " ").strip()[:40] + return "" + + def _get_resource_id(self, elem: Dict[str, Any]) -> str: + return elem.get("resource_id") or elem.get("resource-id", "") + + def _get_content_desc(self, elem: Dict[str, Any]) -> str: + return elem.get("content_desc") or elem.get("content-desc", "") + + def _get_position(self, bbox: Dict[str, int]) -> str: + if not bbox: + return "unknown" + y = bbox.get("y", 0) + x = bbox.get("x", 0) + pos = "top" if y < 400 else "mid" if y < 1200 else "bot" + side = "L" if x < 300 else "C" if x < 700 else "R" + return f"{pos}-{side}" + diff --git a/Agent/platforms/grounding/text/__init__.py b/Agent/platforms/grounding/text/__init__.py new file mode 100644 index 0000000..6e02dc7 --- /dev/null +++ b/Agent/platforms/grounding/text/__init__.py @@ -0,0 +1,4 @@ +from Agent.platforms.grounding.text.serializer import TextSerializer + +__all__ = ['TextSerializer'] + diff --git a/Agent/platforms/grounding/text/serializer.py b/Agent/platforms/grounding/text/serializer.py new file mode 100644 index 0000000..c7407dd --- /dev/null +++ b/Agent/platforms/grounding/text/serializer.py @@ -0,0 +1,109 @@ +from typing import Any, Dict, List + + +class TextSerializer: + """Serializes UI elements as numbered text list.""" + + def serialize(self, elements: List[Dict[str, Any]], platform: str = "android") -> str: + """ + Args: + elements: List of UI element dictionaries + platform: 'android', 'ios', or 'web' + Returns: + Formatted numbered text + """ + if not elements: + return "(no elements)" + + # is_mobile = platform in ("android", "ios") + # max_items = 50 if is_mobile else 150 + max_items = 50 + + lines = [] + for i, el in enumerate(elements[:max_items], 1): + # if platform == "ios": + # line = self._ios(i, el) + # elif platform == "android": + # line = self._android(i, el) + # else: + # line = self._web(i, el) + line = self._android(i, el) + lines.append(line) + + return "\n".join(lines) + + # def _web(self, idx: int, el: Dict[str, Any]) -> str: + # parts = [] + # tag = el.get('class_name', '') or el.get('tag', 'unknown') + # elem_type = el.get('type', '') + # if elem_type and elem_type not in ['text', '']: + # parts.append(f"<{tag} type='{elem_type}'>") + # else: + # parts.append(f"<{tag}>") + # + # if el.get("aria_label"): + # parts.append(f"aria-label='{el['aria_label']}'") + # if el.get("placeholder"): + # parts.append(f"placeholder='{el['placeholder']}'") + # if el.get("text"): + # parts.append(f"text='{el['text']}'") + # if el.get("resource_id"): + # parts.append(f"id='{el['resource_id']}'") + # if el.get("name"): + # parts.append(f"name='{el['name']}'") + # + # return f"{idx}. {' | '.join(parts)}" + + def _android(self, idx: int, el: Dict[str, Any]) -> str: + parts = [] + class_name = el.get('class_name') or el.get('class', 'unknown') + short_class = class_name.split('.')[-1] if '.' in class_name else class_name + parts.append(f"[{short_class}]") + + if el.get("text"): + parts.append(f"text='{el['text']}'") + if el.get("resource_id") or el.get("resource-id"): + parts.append(f"id='{el.get('resource_id') or el.get('resource-id')}'") + + content_desc = el.get("accessibility_label", '') or el.get("content_desc", '') or el.get("content-desc", '') + if content_desc: + parts.append(f"desc='{content_desc}'") + + bbox = el.get("bbox", {}) + if bbox: + y = bbox.get("y", 0) + x = bbox.get("x", 0) + w = bbox.get("width", 0) + h = bbox.get("height", 0) + pos = "top" if y < 400 else "middle" if y < 1200 else "bottom" + side = "left" if x < 300 else "center" if x < 700 else "right" + parts.append(f"pos={pos}-{side} size={w}x{h}") + + return f"{idx}. {' | '.join(parts)}" + + # def _ios(self, idx: int, el: Dict[str, Any]) -> str: + # parts = [] + # class_name = el.get('class_name', 'unknown') + # short_class = class_name.replace('XCUIElementType', '') if 'XCUIElementType' in class_name else class_name + # parts.append(f"[{short_class}]") + # + # if el.get("text"): + # parts.append(f"text='{el['text']}'") + # if el.get("resource_id"): + # parts.append(f"name='{el['resource_id']}'") + # + # label = el.get("accessibility_label", '') or el.get("label", '') + # if label: + # parts.append(f"label='{label}'") + # + # bbox = el.get("bbox", {}) + # if bbox: + # y = bbox.get("y", 0) + # x = bbox.get("x", 0) + # w = bbox.get("width", 0) + # h = bbox.get("height", 0) + # pos = "top" if y < 400 else "middle" if y < 1200 else "bottom" + # side = "left" if x < 300 else "center" if x < 700 else "right" + # parts.append(f"pos={pos}-{side} size={w}x{h}") + # + # return f"{idx}. {' | '.join(parts)}" From 864a7fe2c7c226b2bce18ca10e8007503055a9cd Mon Sep 17 00:00:00 2001 From: hassineabd Date: Tue, 30 Dec 2025 12:17:36 +0100 Subject: [PATCH 13/19] mode to grounding folder --- Agent/platforms/collectors/som_renderer.py | 148 --------------------- 1 file changed, 148 deletions(-) delete mode 100644 Agent/platforms/collectors/som_renderer.py diff --git a/Agent/platforms/collectors/som_renderer.py b/Agent/platforms/collectors/som_renderer.py deleted file mode 100644 index 8d0e3e6..0000000 --- a/Agent/platforms/collectors/som_renderer.py +++ /dev/null @@ -1,148 +0,0 @@ -""" -Set-of-Mark (SoM) Renderer. - -Draws numbered bounding boxes on screenshots for visual grounding. -""" - -import base64 -import io -from typing import Any, Dict, List, Tuple - -from PIL import Image, ImageDraw, ImageFont - - -# Colors for different sources -COLOR_DOM = (34, 197, 94) # Green for DOM elements (has locator) -COLOR_OMNIPARSER = (249, 115, 22) # Orange for OmniParser (click-only) -COLOR_DEFAULT = (59, 130, 246) # Blue default - - -def render_som( - screenshot_base64: str, - elements: List[Dict[str, Any]], - source_key: str = "source", -) -> str: - """ - Draw numbered bounding boxes on screenshot. - - Args: - screenshot_base64: Base64 encoded PNG/JPEG - elements: List with 'bbox' key {x, y, width, height} and optional source - source_key: Key to check for source type ("dom" or "omniparser") - - Returns: - Base64 of annotated image - - Example: - >>> elements = [{'text': 'Search', 'bbox': {'x': 10, 'y': 20, 'width': 100, 'height': 30}, 'source': 'dom'}] - >>> annotated = render_som(screenshot_b64, elements) - """ - img_bytes = base64.b64decode(screenshot_base64) - img = Image.open(io.BytesIO(img_bytes)).convert("RGBA") - - overlay = Image.new("RGBA", img.size, (0, 0, 0, 0)) - draw = ImageDraw.Draw(overlay) - - try: - import platform - system = platform.system() - - if system == "Darwin": - font = ImageFont.truetype("/System/Library/Fonts/Helvetica.ttc", 14) - font_large = ImageFont.truetype("/System/Library/Fonts/Helvetica.ttc", 24) - elif system == "Windows": - font = ImageFont.truetype("arial.ttf", 14) - font_large = ImageFont.truetype("arial.ttf", 24) - else: - font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 14) - font_large = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 24) - except Exception: - font = ImageFont.load_default() - font_large = font - - for idx, element in enumerate(elements, start=1): - bbox = element.get("bbox") - if not bbox: - continue - - x = bbox.get("x", 0) - y = bbox.get("y", 0) - w = bbox.get("width", 0) - h = bbox.get("height", 0) - - if w <= 0 or h <= 0: - continue - - source = element.get(source_key, "dom") - color = COLOR_DOM if source == "dom" else COLOR_OMNIPARSER if source == "omniparser" else COLOR_DEFAULT - - # Apply margin to create visual spacing, but ensure box remains valid - margin = 4 - # Adjust margin if element is too small - if w <= 2 * margin: - margin = max(0, w // 2 - 1) - if h <= 2 * margin: - margin = min(margin, max(0, h // 2 - 1)) - - box_x1 = x + margin - box_y1 = y + margin - box_x2 = x + w - margin - box_y2 = y + h - margin - - # Ensure valid coordinates (x2 > x1, y2 > y1) - if box_x2 <= box_x1: - box_x2 = box_x1 + 1 - if box_y2 <= box_y1: - box_y2 = box_y1 + 1 - - # Draw box with transparency - draw.rectangle( - [box_x1, box_y1, box_x2, box_y2], - outline=color + (255,), - width=2 - ) - - # Draw label background (top-left inside box) - label = str(idx) - label_bbox = draw.textbbox((0, 0), label, font=font_large) - label_w = label_bbox[2] - label_bbox[0] + 12 - label_h = label_bbox[3] - label_bbox[1] + 8 - - label_x = box_x1 + 5 - label_y = box_y1 + 5 - - draw.rectangle( - [label_x, label_y, label_x + label_w, label_y + label_h], - fill=color + (230,) - ) - - # Draw label text with stroke for better contrast - draw.text( - (label_x + 6, label_y + 4), - label, - fill=(255, 255, 255, 255), - font=font_large, - stroke_width=2, - stroke_fill=(0, 0, 0, 255) - ) - - result = Image.alpha_composite(img, overlay).convert("RGB") - - buffer = io.BytesIO() - result.save(buffer, format="PNG") - return base64.b64encode(buffer.getvalue()).decode("utf-8") - - -def bbox_center(bbox: Dict[str, int]) -> Tuple[int, int]: - if not bbox: - return (0, 0) - x = bbox.get("x", 0) - y = bbox.get("y", 0) - w = bbox.get("width", 0) - h = bbox.get("height", 0) - return (x + w // 2, y + h // 2) - - - - - From ab8b5385fb311ed9ca4f1d214dcd69811674c8c4 Mon Sep 17 00:00:00 2001 From: hassineabd Date: Tue, 30 Dec 2025 12:17:58 +0100 Subject: [PATCH 14/19] moved to grounding + folder to externilize prompts --- Agent/ai/prompts/__init__.py | 5 +- Agent/ai/prompts/renderer.py | 123 ----------------------------------- 2 files changed, 1 insertion(+), 127 deletions(-) delete mode 100644 Agent/ai/prompts/renderer.py diff --git a/Agent/ai/prompts/__init__.py b/Agent/ai/prompts/__init__.py index 162137d..960b2ff 100644 --- a/Agent/ai/prompts/__init__.py +++ b/Agent/ai/prompts/__init__.py @@ -1,4 +1 @@ -from Agent.ai.prompts.renderer import UIRenderer - -__all__ = ["UIRenderer"] - +# TODO: prompt templates diff --git a/Agent/ai/prompts/renderer.py b/Agent/ai/prompts/renderer.py deleted file mode 100644 index 5c8fc7c..0000000 --- a/Agent/ai/prompts/renderer.py +++ /dev/null @@ -1,123 +0,0 @@ -from typing import Any, Dict, List - - -class UIRenderer: - """Renders UI elements as text for AI prompts.""" - - def render(self, elements: List[Dict[str, Any]], platform: str = "web") -> str: - """ - Render UI elements as numbered text list for AI prompt. - - Args: - elements: List of UI element dictionaries - platform: 'web', 'android', or 'ios' - Returns: - Formatted string with numbered elements - """ - if not elements: - return "(no UI elements found)" - - is_mobile = platform in ("android", "ios") - max_items = 50 if is_mobile else 150 - - rendered = [] - for i, el in enumerate(elements[:max_items], 1): - if platform == "ios": - line = self._render_ios_element(i, el) - elif platform == "android": - line = self._render_android_element(i, el) - else: - line = self._render_web_element(i, el) - rendered.append(line) - - return "\n".join(rendered) - - def _render_web_element(self, index: int, el: Dict[str, Any]) -> str: - """Render a single web element.""" - parts = [] - - tag = el.get('class_name', '') or el.get('tag', 'unknown') - elem_type = el.get('type', '') - if elem_type and elem_type not in ['text', '']: - parts.append(f"<{tag} type='{elem_type}'>") - else: - parts.append(f"<{tag}>") - - aria_label = el.get("aria_label", '') - if aria_label: - parts.append(f"aria-label='{aria_label}'") - - placeholder = el.get("placeholder", '') - if placeholder: - parts.append(f"placeholder='{placeholder}'") - - if el.get("text"): - parts.append(f"text='{el['text']}'") - - if el.get("resource_id"): - parts.append(f"id='{el['resource_id']}'") - - if el.get("name"): - parts.append(f"name='{el['name']}'") - - return f"{index}. {' | '.join(parts)}" - - def _render_android_element(self, index: int, el: Dict[str, Any]) -> str: - """Render a single Android element.""" - parts = [] - - class_name = el.get('class_name', 'unknown') - short_class = class_name.split('.')[-1] if '.' in class_name else class_name - parts.append(f"[{short_class}]") - - if el.get("text"): - parts.append(f"text='{el['text']}'") - - if el.get("resource_id"): - parts.append(f"id='{el['resource_id']}'") - - content_desc = el.get("accessibility_label", '') or el.get("content_desc", '') - if content_desc: - parts.append(f"desc='{content_desc}'") - - bbox = el.get("bbox", {}) - if bbox: - y = bbox.get("y", 0) - x = bbox.get("x", 0) - w = bbox.get("width", 0) - h = bbox.get("height", 0) - pos = "top" if y < 400 else "middle" if y < 1200 else "bottom" - side = "left" if x < 300 else "center" if x < 700 else "right" - parts.append(f"pos={pos}-{side} size={w}x{h}") - - return f"{index}. {' | '.join(parts)}" - - def _render_ios_element(self, index: int, el: Dict[str, Any]) -> str: - """Render a single iOS element.""" - parts = [] - - class_name = el.get('class_name', 'unknown') - short_class = class_name.replace('XCUIElementType', '') if 'XCUIElementType' in class_name else class_name - parts.append(f"[{short_class}]") - - if el.get("text"): - parts.append(f"text='{el['text']}'") - - if el.get("resource_id"): - parts.append(f"name='{el['resource_id']}'") - - label = el.get("accessibility_label", '') or el.get("label", '') - if label: - parts.append(f"label='{label}'") - - bbox = el.get("bbox", {}) - if bbox: - y = bbox.get("y", 0) - x = bbox.get("x", 0) - w = bbox.get("width", 0) - h = bbox.get("height", 0) - pos = "top" if y < 400 else "middle" if y < 1200 else "bottom" - side = "left" if x < 300 else "center" if x < 700 else "right" - parts.append(f"pos={pos}-{side} size={w}x{h}") - - return f"{index}. {' | '.join(parts)}" From 3528ae1953e17bf113e6b318dcda8d85337e6ff4 Mon Sep 17 00:00:00 2001 From: hassineabd Date: Tue, 30 Dec 2025 12:19:17 +0100 Subject: [PATCH 15/19] fixing init --- Agent/platforms/collectors/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Agent/platforms/collectors/__init__.py b/Agent/platforms/collectors/__init__.py index 0d6b36b..ad510fd 100644 --- a/Agent/platforms/collectors/__init__.py +++ b/Agent/platforms/collectors/__init__.py @@ -7,11 +7,11 @@ from Agent.platforms.collectors.android_collector import AndroidCollector from Agent.platforms.collectors.ios_collector import IOSCollector -from Agent.platforms.collectors.som_renderer import render_som, bbox_center +from Agent.platforms.grounding.som.annotator import annotate_screenshot, bbox_center __all__ = [ 'AndroidCollector', 'IOSCollector', - 'render_som', + 'annotate_screenshot', 'bbox_center', ] From 03e6e7039a7594e3a408539a12fe2b20793aca2d Mon Sep 17 00:00:00 2001 From: hassineabd Date: Tue, 30 Dec 2025 12:19:32 +0100 Subject: [PATCH 16/19] adapt mobile connector --- Agent/platforms/_mobileconnector.py | 34 +++++++++++++++++++++++++---- 1 file changed, 30 insertions(+), 4 deletions(-) diff --git a/Agent/platforms/_mobileconnector.py b/Agent/platforms/_mobileconnector.py index c81fee3..d710352 100644 --- a/Agent/platforms/_mobileconnector.py +++ b/Agent/platforms/_mobileconnector.py @@ -74,6 +74,9 @@ def collect_ui_candidates(self, max_items: int = 50) -> List[Dict[str, Any]]: elements = collector.parse_xml(xml) filtered = pipeline.apply(elements) + screen_size = self.get_screen_size() + self._add_normalized_bbox(filtered, screen_size) + filtered.sort( key=lambda e: ( bool(e.get('resource-id', '').strip()), @@ -89,7 +92,30 @@ def collect_ui_candidates(self, max_items: int = 50) -> List[Dict[str, Any]]: def collect_all_elements(self) -> List[Dict[str, Any]]: xml = self.get_ui_xml() collector = self._get_collector() - return collector.parse_xml(xml) + elements = collector.parse_xml(xml) + + screen_size = self.get_screen_size() + self._add_normalized_bbox(elements, screen_size) + + return elements + + def _add_normalized_bbox(self, elements: List[Dict[str, Any]], screen_size: Dict[str, int]) -> None: + """Add bbox_normalized to each element.""" + sw = screen_size.get('width', 0) + sh = screen_size.get('height', 0) + + if sw <= 0 or sh <= 0: + return + + for elem in elements: + bbox = elem.get('bbox', {}) + if bbox: + elem['bbox_normalized'] = { + 'x': round(bbox.get('x', 0) / sw, 4), + 'y': round(bbox.get('y', 0) / sh, 4), + 'width': round(bbox.get('width', 0) / sw, 4), + 'height': round(bbox.get('height', 0) / sh, 4), + } def build_locator_from_element(self, element: Dict[str, Any], strategy: str = 'auto') -> str: """ @@ -104,7 +130,7 @@ def build_locator_from_element(self, element: Dict[str, Any], strategy: str = 'a def render_ui_for_prompt(self, ui_elements: List[Dict[str, Any]]) -> str: platform = self.get_platform() - return self._get_renderer().render(ui_elements, platform=platform) + return self._get_renderer().serialize(ui_elements, platform=platform) def get_screenshot_base64(self) -> str: return self._get_driver().get_screenshot_as_base64() @@ -148,6 +174,6 @@ def _get_filter_pipeline(self): def _get_renderer(self): if self._renderer is None: - from Agent.platforms.collectors import render_som - self._renderer = type('Renderer', (), {'render': lambda self, elements, platform: render_som('', elements)})() + from Agent.platforms.grounding.text.serializer import TextSerializer + self._renderer = TextSerializer() return self._renderer From 378d4649bccb5e93a5c11bdf6423783c146a2c58 Mon Sep 17 00:00:00 2001 From: hassineabd Date: Tue, 30 Dec 2025 12:19:44 +0100 Subject: [PATCH 17/19] adding som config variables --- Agent/agent_engine.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/Agent/agent_engine.py b/Agent/agent_engine.py index dbba091..0983dcf 100644 --- a/Agent/agent_engine.py +++ b/Agent/agent_engine.py @@ -15,6 +15,12 @@ class AgentEngine: """Core engine for AI-driven Android test automation.""" + SOM_CONFIG = { + 'visual_annotation': True, + 'text_format': 'compact', + 'output_type': 'text' + } + def __init__( self, llm_client: str = "openai", @@ -176,6 +182,7 @@ def do(self, instruction: str) -> None: llm_input_format=self.llm_input_format, screenshot_base64=screenshot_base64, annotated_image_path=annotated_image_path, + som_config=self.SOM_CONFIG if self.llm_input_format == "som" else None, ) if annotated_image_path: logger.info(f"Annotated image: {annotated_image_path}") From 5fb8f9cfba1e69b8d7e2f7ec4cc75b54281f7779 Mon Sep 17 00:00:00 2001 From: hassineabd Date: Tue, 30 Dec 2025 12:30:51 +0100 Subject: [PATCH 18/19] added fallback to screensize --- Agent/platforms/_mobileconnector.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/Agent/platforms/_mobileconnector.py b/Agent/platforms/_mobileconnector.py index d710352..d794b49 100644 --- a/Agent/platforms/_mobileconnector.py +++ b/Agent/platforms/_mobileconnector.py @@ -60,8 +60,13 @@ def get_platform(self) -> str: return self._platform def get_screen_size(self) -> Dict[str, int]: - size = self._get_driver().get_window_size() - return {'width': size.get('width', 0), 'height': size.get('height', 0)} + try: + size = self._get_driver().get_window_size() + return {'width': size.get('width', 0), 'height': size.get('height', 0)} + #TODO: see if this is really needed and if there is better fallback + except Exception: + logger.warn("⚠️ Could not get screen size, using fallback 1080x1920") + return {'width': 1080, 'height': 1920} def get_ui_xml(self) -> str: return self._get_driver().page_source From a988a084aceca0cfbeb51ec12b5843981a6c428e Mon Sep 17 00:00:00 2001 From: hassineabd Date: Tue, 30 Dec 2025 12:31:03 +0100 Subject: [PATCH 19/19] updated prompts --- Agent/ai/_promptcomposer.py | 188 ++++++++++++++++++++++++------------ 1 file changed, 124 insertions(+), 64 deletions(-) diff --git a/Agent/ai/_promptcomposer.py b/Agent/ai/_promptcomposer.py index df3324c..2cb6132 100644 --- a/Agent/ai/_promptcomposer.py +++ b/Agent/ai/_promptcomposer.py @@ -1,6 +1,7 @@ from typing import List, Dict, Optional, Any from Agent.tools.registry import ToolRegistry from Agent.tools.base import ToolCategory +from Agent.platforms.grounding import SomComposer from robot.api import logger import base64 import os @@ -46,6 +47,7 @@ def compose_do_messages( llm_input_format: str = "text", screenshot_base64: Optional[str] = None, annotated_image_path: Optional[str] = None, + som_config: Optional[Dict[str, Any]] = None, ) -> List[Dict[str, Any]]: """Build DO action messages using tool calling approach. @@ -57,6 +59,12 @@ def compose_do_messages( llm_input_format: 'text' or 'som' screenshot_base64: Screenshot (required for SoM mode) annotated_image_path: Pre-annotated image from OmniParser + som_config: SoM configuration dict { + 'visual_annotation': True/False, + 'text_format': 'compact'/'detailed'/'minimal', + 'output_type': 'text'/'json', + 'include_screenshot': True/False + } """ # Base system prompt is_mobile = platform in ("android", "ios") @@ -64,6 +72,14 @@ def compose_do_messages( system_content = ( "You are a MOBILE app test automation engine (Appium).\n" "Your job: analyze the instruction and call the appropriate function to interact with the mobile UI.\n" + "\n⚠️ CRITICAL TOOL SELECTION:\n" + "- IF instruction says 'click', 'tap', 'select', 'choose' → ALWAYS use tap_element(index)\n" + "- scroll/swipe tools are ONLY for navigation - NEVER use them to click/tap\n" + "\n⚠️ IMPORTANT:\n" + "ALL tools have a 'reasoning' parameter. You MUST provide a brief explanation (1 sentence) of:\n" + "- Which element you chose and why (for element-based actions)\n" + "- Why this action matches the instruction (for all actions)\n" + "Example: {\"element_index\": 5, \"reasoning\": \"Clicking the search icon at the top right to open search\"}\n" ) if element_source == "vision": @@ -75,94 +91,138 @@ def compose_do_messages( ) else: system_content += ( - "\nUSE LOCATOR TOOLS:\n" - "1. FOR TEXT INPUT: input_text(element_index, text) - select from numbered list\n" - "2. FOR CLICKING: tap_element(index) - select from numbered list\n" - "3. OTHER: scroll_down(), swipe_left/right/up(), long_press(index), hide_keyboard(), go_back()\n" + "\n🎯 TOOL SELECTION RULES:\n" + "1. IF element is VISIBLE in the UI list → USE tap_element(index) to click it\n" + "2. IF you need to type text → USE input_text(index, text)\n" + "3. IF target element is NOT in the list → USE scroll_down/swipe_up to reveal it\n" + "4. NEVER use scroll/swipe when the target element is already visible!\n" + "5. scroll_down, swipe_up, swipe_left, swipe_right are ONLY for navigation - NOT for clicking!\n" + "6. To click ANY element from the list, ALWAYS use tap_element(index)\n" + "\nCRITICAL NOTES:\n" + "- The screenshot shows NUMBERED bounding boxes. Use what you SEE in the image!\n" + "- tap_element() clicks by COORDINATES - you CAN tap ANY visible element, even if not marked as clickable\n" + "- If you see the target element on screen, CLICK IT directly with tap_element()\n" + "- Search suggestions, list items, buttons = ALL require tap_element()\n" ) system_content += ( "\nIMPORTANT: You are working with MOBILE apps (Android/iOS), NOT web browsers." ) - else: - system_content = ( - "You are a WEB test automation engine.\n" - "Your job: analyze the instruction and call the appropriate function to interact with the web page.\n" - ) - - if element_source == "vision": - system_content += ( - "\nUSE VISUAL TOOLS:\n" - "- click_visual_element(description): Click by visual description\n" - "- input_text_visual(description, text): Input text by visual description\n" - "- hover_visual(description): Hover by visual description\n" - "- double_click_visual(description): Double click by visual description\n" - "- Elements were detected using computer vision (OmniParser)\n" - ) - else: - system_content += ( - "\nUSE LOCATOR TOOLS:\n" - "1. FOR TEXT INPUT: input_text(index, text) for or