diff --git a/Agent/agent_engine.py b/Agent/agent_engine.py
index dbba091..0983dcf 100644
--- a/Agent/agent_engine.py
+++ b/Agent/agent_engine.py
@@ -15,6 +15,12 @@
class AgentEngine:
"""Core engine for AI-driven Android test automation."""
+ SOM_CONFIG = {
+ 'visual_annotation': True,
+ 'text_format': 'compact',
+ 'output_type': 'text'
+ }
+
def __init__(
self,
llm_client: str = "openai",
@@ -176,6 +182,7 @@ def do(self, instruction: str) -> None:
llm_input_format=self.llm_input_format,
screenshot_base64=screenshot_base64,
annotated_image_path=annotated_image_path,
+ som_config=self.SOM_CONFIG if self.llm_input_format == "som" else None,
)
if annotated_image_path:
logger.info(f"Annotated image: {annotated_image_path}")
diff --git a/Agent/ai/_promptcomposer.py b/Agent/ai/_promptcomposer.py
index df3324c..2cb6132 100644
--- a/Agent/ai/_promptcomposer.py
+++ b/Agent/ai/_promptcomposer.py
@@ -1,6 +1,7 @@
from typing import List, Dict, Optional, Any
from Agent.tools.registry import ToolRegistry
from Agent.tools.base import ToolCategory
+from Agent.platforms.grounding import SomComposer
from robot.api import logger
import base64
import os
@@ -46,6 +47,7 @@ def compose_do_messages(
llm_input_format: str = "text",
screenshot_base64: Optional[str] = None,
annotated_image_path: Optional[str] = None,
+ som_config: Optional[Dict[str, Any]] = None,
) -> List[Dict[str, Any]]:
"""Build DO action messages using tool calling approach.
@@ -57,6 +59,12 @@ def compose_do_messages(
llm_input_format: 'text' or 'som'
screenshot_base64: Screenshot (required for SoM mode)
annotated_image_path: Pre-annotated image from OmniParser
+ som_config: SoM configuration dict {
+ 'visual_annotation': True/False,
+ 'text_format': 'compact'/'detailed'/'minimal',
+ 'output_type': 'text'/'json',
+ 'include_screenshot': True/False
+ }
"""
# Base system prompt
is_mobile = platform in ("android", "ios")
@@ -64,6 +72,14 @@ def compose_do_messages(
system_content = (
"You are a MOBILE app test automation engine (Appium).\n"
"Your job: analyze the instruction and call the appropriate function to interact with the mobile UI.\n"
+ "\n⚠️ CRITICAL TOOL SELECTION:\n"
+ "- IF instruction says 'click', 'tap', 'select', 'choose' → ALWAYS use tap_element(index)\n"
+ "- scroll/swipe tools are ONLY for navigation - NEVER use them to click/tap\n"
+ "\n⚠️ IMPORTANT:\n"
+ "ALL tools have a 'reasoning' parameter. You MUST provide a brief explanation (1 sentence) of:\n"
+ "- Which element you chose and why (for element-based actions)\n"
+ "- Why this action matches the instruction (for all actions)\n"
+ "Example: {\"element_index\": 5, \"reasoning\": \"Clicking the search icon at the top right to open search\"}\n"
)
if element_source == "vision":
@@ -75,94 +91,138 @@ def compose_do_messages(
)
else:
system_content += (
- "\nUSE LOCATOR TOOLS:\n"
- "1. FOR TEXT INPUT: input_text(element_index, text) - select from numbered list\n"
- "2. FOR CLICKING: tap_element(index) - select from numbered list\n"
- "3. OTHER: scroll_down(), swipe_left/right/up(), long_press(index), hide_keyboard(), go_back()\n"
+ "\n🎯 TOOL SELECTION RULES:\n"
+ "1. IF element is VISIBLE in the UI list → USE tap_element(index) to click it\n"
+ "2. IF you need to type text → USE input_text(index, text)\n"
+ "3. IF target element is NOT in the list → USE scroll_down/swipe_up to reveal it\n"
+ "4. NEVER use scroll/swipe when the target element is already visible!\n"
+ "5. scroll_down, swipe_up, swipe_left, swipe_right are ONLY for navigation - NOT for clicking!\n"
+ "6. To click ANY element from the list, ALWAYS use tap_element(index)\n"
+ "\nCRITICAL NOTES:\n"
+ "- The screenshot shows NUMBERED bounding boxes. Use what you SEE in the image!\n"
+ "- tap_element() clicks by COORDINATES - you CAN tap ANY visible element, even if not marked as clickable\n"
+ "- If you see the target element on screen, CLICK IT directly with tap_element()\n"
+ "- Search suggestions, list items, buttons = ALL require tap_element()\n"
)
system_content += (
"\nIMPORTANT: You are working with MOBILE apps (Android/iOS), NOT web browsers."
)
- else:
- system_content = (
- "You are a WEB test automation engine.\n"
- "Your job: analyze the instruction and call the appropriate function to interact with the web page.\n"
- )
-
- if element_source == "vision":
- system_content += (
- "\nUSE VISUAL TOOLS:\n"
- "- click_visual_element(description): Click by visual description\n"
- "- input_text_visual(description, text): Input text by visual description\n"
- "- hover_visual(description): Hover by visual description\n"
- "- double_click_visual(description): Double click by visual description\n"
- "- Elements were detected using computer vision (OmniParser)\n"
- )
- else:
- system_content += (
- "\nUSE LOCATOR TOOLS:\n"
- "1. FOR TEXT INPUT: input_text(index, text) for or