Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions Agent/agent_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,12 @@
class AgentEngine:
"""Core engine for AI-driven Android test automation."""

SOM_CONFIG = {
'visual_annotation': True,
'text_format': 'compact',
'output_type': 'text'
}

def __init__(
self,
llm_client: str = "openai",
Expand Down Expand Up @@ -176,6 +182,7 @@ def do(self, instruction: str) -> None:
llm_input_format=self.llm_input_format,
screenshot_base64=screenshot_base64,
annotated_image_path=annotated_image_path,
som_config=self.SOM_CONFIG if self.llm_input_format == "som" else None,
)
if annotated_image_path:
logger.info(f"Annotated image: {annotated_image_path}")
Expand Down
188 changes: 124 additions & 64 deletions Agent/ai/_promptcomposer.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from typing import List, Dict, Optional, Any
from Agent.tools.registry import ToolRegistry
from Agent.tools.base import ToolCategory
from Agent.platforms.grounding import SomComposer
from robot.api import logger
import base64
import os
Expand Down Expand Up @@ -46,6 +47,7 @@ def compose_do_messages(
llm_input_format: str = "text",
screenshot_base64: Optional[str] = None,
annotated_image_path: Optional[str] = None,
som_config: Optional[Dict[str, Any]] = None,
) -> List[Dict[str, Any]]:
"""Build DO action messages using tool calling approach.

Expand All @@ -57,13 +59,27 @@ def compose_do_messages(
llm_input_format: 'text' or 'som'
screenshot_base64: Screenshot (required for SoM mode)
annotated_image_path: Pre-annotated image from OmniParser
som_config: SoM configuration dict {
'visual_annotation': True/False,
'text_format': 'compact'/'detailed'/'minimal',
'output_type': 'text'/'json',
'include_screenshot': True/False
}
"""
# Base system prompt
is_mobile = platform in ("android", "ios")
if is_mobile:
system_content = (
"You are a MOBILE app test automation engine (Appium).\n"
"Your job: analyze the instruction and call the appropriate function to interact with the mobile UI.\n"
"\n⚠️ CRITICAL TOOL SELECTION:\n"
"- IF instruction says 'click', 'tap', 'select', 'choose' → ALWAYS use tap_element(index)\n"
"- scroll/swipe tools are ONLY for navigation - NEVER use them to click/tap\n"
"\n⚠️ IMPORTANT:\n"
"ALL tools have a 'reasoning' parameter. You MUST provide a brief explanation (1 sentence) of:\n"
"- Which element you chose and why (for element-based actions)\n"
"- Why this action matches the instruction (for all actions)\n"
"Example: {\"element_index\": 5, \"reasoning\": \"Clicking the search icon at the top right to open search\"}\n"
)

if element_source == "vision":
Expand All @@ -75,94 +91,138 @@ def compose_do_messages(
)
else:
system_content += (
"\nUSE LOCATOR TOOLS:\n"
"1. FOR TEXT INPUT: input_text(element_index, text) - select from numbered list\n"
"2. FOR CLICKING: tap_element(index) - select from numbered list\n"
"3. OTHER: scroll_down(), swipe_left/right/up(), long_press(index), hide_keyboard(), go_back()\n"
"\n🎯 TOOL SELECTION RULES:\n"
"1. IF element is VISIBLE in the UI list → USE tap_element(index) to click it\n"
"2. IF you need to type text → USE input_text(index, text)\n"
"3. IF target element is NOT in the list → USE scroll_down/swipe_up to reveal it\n"
"4. NEVER use scroll/swipe when the target element is already visible!\n"
"5. scroll_down, swipe_up, swipe_left, swipe_right are ONLY for navigation - NOT for clicking!\n"
"6. To click ANY element from the list, ALWAYS use tap_element(index)\n"
"\nCRITICAL NOTES:\n"
"- The screenshot shows NUMBERED bounding boxes. Use what you SEE in the image!\n"
"- tap_element() clicks by COORDINATES - you CAN tap ANY visible element, even if not marked as clickable\n"
"- If you see the target element on screen, CLICK IT directly with tap_element()\n"
"- Search suggestions, list items, buttons = ALL require tap_element()\n"
)

system_content += (
"\nIMPORTANT: You are working with MOBILE apps (Android/iOS), NOT web browsers."
)
else:
system_content = (
"You are a WEB test automation engine.\n"
"Your job: analyze the instruction and call the appropriate function to interact with the web page.\n"
)

if element_source == "vision":
system_content += (
"\nUSE VISUAL TOOLS:\n"
"- click_visual_element(description): Click by visual description\n"
"- input_text_visual(description, text): Input text by visual description\n"
"- hover_visual(description): Hover by visual description\n"
"- double_click_visual(description): Double click by visual description\n"
"- Elements were detected using computer vision (OmniParser)\n"
)
else:
system_content += (
"\nUSE LOCATOR TOOLS:\n"
"1. FOR TEXT INPUT: input_text(index, text) for <input> or <textarea> elements\n"
"2. FOR CLICKING: click_element(index) for <button> or <a> elements\n"
"3. FOR DROPDOWN: select_option(index, value) for <select> elements\n"
"4. OTHER: scroll_down(), scroll_up(), press_key(), go_back(), hover(), double_click()\n"
)

system_content += (
"\nCRITICAL: Pay attention to element tags when using standard tools:\n"
"- <input> or <textarea> = text input fields (use input_text tool)\n"
"- <button> or <a> = clickable elements (use click_element tool)\n"
"- <select> = dropdown (use select_option tool)\n"
)
# else:
# system_content = (
# "You are a WEB test automation engine.\n"
# "Your job: analyze the instruction and call the appropriate function to interact with the web page.\n"
# )
#
# if element_source == "vision":
# system_content += (
# "\nUSE VISUAL TOOLS:\n"
# "- click_visual_element(description): Click by visual description\n"
# "- input_text_visual(description, text): Input text by visual description\n"
# "- hover_visual(description): Hover by visual description\n"
# "- double_click_visual(description): Double click by visual description\n"
# "- Elements were detected using computer vision (OmniParser)\n"
# )
# else:
# system_content += (
# "\nUSE LOCATOR TOOLS:\n"
# "1. FOR TEXT INPUT: input_text(index, text) for <input> or <textarea> elements\n"
# "2. FOR CLICKING: click_element(index) for <button> or <a> elements\n"
# "3. FOR DROPDOWN: select_option(index, value) for <select> elements\n"
# "4. OTHER: scroll_down(), scroll_up(), press_key(), go_back(), hover(), double_click()\n"
# )
#
# system_content += (
# "\nCRITICAL: Pay attention to element tags when using standard tools:\n"
# "- <input> or <textarea> = text input fields (use input_text tool)\n"
# "- <button> or <a> = clickable elements (use click_element tool)\n"
# "- <select> = dropdown (use select_option tool)\n"
# )

# Build user content based on llm_input_format
ui_label = "Mobile UI Elements" if is_mobile else "Web Elements"
# ui_label = "Mobile UI Elements" if is_mobile else "Web Elements"
ui_label = "Mobile UI Elements"

if llm_input_format == "som" and ui_elements:
source_info = "detected via computer vision" if element_source == "vision" else "from accessibility tree"

legend_lines = []
for idx, elem in enumerate(ui_elements, start=1):
text = elem.get("text", "").replace("\n", " ").strip()[:40]
tag = elem.get("class_name", "")
short_tag = tag.split('.')[-1] if '.' in tag else tag
desc = text if text else (elem.get("aria_label") or elem.get("content_desc") or elem.get("placeholder") or "")
bbox = elem.get("bbox", {})
pos_info = ""
if bbox:
y = bbox.get("y", 0)
x = bbox.get("x", 0)
pos = "top" if y < 400 else "mid" if y < 1200 else "bot"
side = "L" if x < 300 else "C" if x < 700 else "R"
pos_info = f" @{pos}-{side}"
legend_lines.append(f"[{idx}] {short_tag}: {desc}{pos_info}".strip())
legend_text = "\n".join(legend_lines)
# Get screen dimensions
screen_size = self.platform.get_screen_size()
screen_width = screen_size['width']
screen_height = screen_size['height']

text_content = (
f"Instruction: {instruction}\n\n"
f"ANNOTATED SCREENSHOT: Each UI element has a GREEN BOX with its ID NUMBER in a small rectangle at the top-left.\n"
f"ELEMENT LIST ({source_info}):\n{legend_text}\n\n"
f"IMPORTANT: Select the element by its ID NUMBER that best matches the instruction."
)
# Default SoM config
if som_config is None:
som_config = {
'visual_annotation': True,
'text_format': 'compact',
'output_type': 'text'
}

# Use SomComposer to generate SoM components
som_composer = SomComposer(platform, screen_width, screen_height)

# Use pre-annotated image from OmniParser if available (Visual + SoM)
if annotated_image_path:
with open(annotated_image_path, "rb") as img_file:
annotated_base64 = base64.b64encode(img_file.read()).decode("utf-8")
self._save_annotated_image(annotated_base64, source="omniparser")

# Generate text legend using SomComposer
som_result = som_composer.compose(
screenshot_base64=None,
elements=ui_elements,
config={**som_config, 'visual_annotation': False}
)

if som_config.get('output_type') == 'json':
legend_text = som_result.get('elements_json', '')
else:
legend_text = som_result.get('text_legend', '')

text_content = (
f"Instruction: {instruction}\n\n"
f"ANNOTATED SCREENSHOT: Each UI element has a GREEN BOX with its ID NUMBER in a small rectangle at the top-left.\n"
f"ELEMENT LIST ({source_info}):\n{legend_text}\n\n"
f"IMPORTANT: Select the element by its ID NUMBER that best matches the instruction."
)

user_content = [
{"type": "text", "text": text_content},
{"type": "image_url", "image_url": {"url": f"data:image/png;base64,{annotated_base64}"}}
]
# Otherwise render SoM for DOM elements (DOM + SoM)
elif screenshot_base64:
from Agent.platforms.collectors.som_renderer import render_som
annotated_screenshot = render_som(screenshot_base64, ui_elements)
self._save_annotated_image(annotated_screenshot, source="dom")
user_content = [
{"type": "text", "text": text_content},
{"type": "image_url", "image_url": {"url": f"data:image/png;base64,{annotated_screenshot}"}}
]
som_result = som_composer.compose(
screenshot_base64=screenshot_base64,
elements=ui_elements,
config=som_config
)

annotated_screenshot = som_result.get('annotated_image_base64', '')

if som_config.get('output_type') == 'json':
legend_text = som_result.get('elements_json', '')
else:
legend_text = som_result.get('text_legend', '')

if annotated_screenshot:
self._save_annotated_image(annotated_screenshot, source="dom")

text_content = (
f"Instruction: {instruction}\n\n"
f"ANNOTATED SCREENSHOT: Each UI element has a GREEN BOX with its ID NUMBER in a small rectangle at the top-left.\n"
f"ELEMENT LIST ({source_info}):\n{legend_text}\n\n"
f"IMPORTANT: Select the element by its ID NUMBER that best matches the instruction."
)

if annotated_screenshot:
user_content = [
{"type": "text", "text": text_content},
{"type": "image_url", "image_url": {"url": f"data:image/png;base64,{annotated_screenshot}"}}
]
else:
user_content = text_content
else:
user_content = f"Instruction: {instruction}\n\nError: SoM mode requires screenshot"
else:
Expand Down
5 changes: 1 addition & 4 deletions Agent/ai/prompts/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1 @@
from Agent.ai.prompts.renderer import UIRenderer

__all__ = ["UIRenderer"]

# TODO: prompt templates
123 changes: 0 additions & 123 deletions Agent/ai/prompts/renderer.py

This file was deleted.

Loading
Loading