diff --git a/Agent/agent_engine.py b/Agent/agent_engine.py index dbba091..0983dcf 100644 --- a/Agent/agent_engine.py +++ b/Agent/agent_engine.py @@ -15,6 +15,12 @@ class AgentEngine: """Core engine for AI-driven Android test automation.""" + SOM_CONFIG = { + 'visual_annotation': True, + 'text_format': 'compact', + 'output_type': 'text' + } + def __init__( self, llm_client: str = "openai", @@ -176,6 +182,7 @@ def do(self, instruction: str) -> None: llm_input_format=self.llm_input_format, screenshot_base64=screenshot_base64, annotated_image_path=annotated_image_path, + som_config=self.SOM_CONFIG if self.llm_input_format == "som" else None, ) if annotated_image_path: logger.info(f"Annotated image: {annotated_image_path}") diff --git a/Agent/ai/_promptcomposer.py b/Agent/ai/_promptcomposer.py index df3324c..2cb6132 100644 --- a/Agent/ai/_promptcomposer.py +++ b/Agent/ai/_promptcomposer.py @@ -1,6 +1,7 @@ from typing import List, Dict, Optional, Any from Agent.tools.registry import ToolRegistry from Agent.tools.base import ToolCategory +from Agent.platforms.grounding import SomComposer from robot.api import logger import base64 import os @@ -46,6 +47,7 @@ def compose_do_messages( llm_input_format: str = "text", screenshot_base64: Optional[str] = None, annotated_image_path: Optional[str] = None, + som_config: Optional[Dict[str, Any]] = None, ) -> List[Dict[str, Any]]: """Build DO action messages using tool calling approach. @@ -57,6 +59,12 @@ def compose_do_messages( llm_input_format: 'text' or 'som' screenshot_base64: Screenshot (required for SoM mode) annotated_image_path: Pre-annotated image from OmniParser + som_config: SoM configuration dict { + 'visual_annotation': True/False, + 'text_format': 'compact'/'detailed'/'minimal', + 'output_type': 'text'/'json', + 'include_screenshot': True/False + } """ # Base system prompt is_mobile = platform in ("android", "ios") @@ -64,6 +72,14 @@ def compose_do_messages( system_content = ( "You are a MOBILE app test automation engine (Appium).\n" "Your job: analyze the instruction and call the appropriate function to interact with the mobile UI.\n" + "\n⚠️ CRITICAL TOOL SELECTION:\n" + "- IF instruction says 'click', 'tap', 'select', 'choose' → ALWAYS use tap_element(index)\n" + "- scroll/swipe tools are ONLY for navigation - NEVER use them to click/tap\n" + "\n⚠️ IMPORTANT:\n" + "ALL tools have a 'reasoning' parameter. You MUST provide a brief explanation (1 sentence) of:\n" + "- Which element you chose and why (for element-based actions)\n" + "- Why this action matches the instruction (for all actions)\n" + "Example: {\"element_index\": 5, \"reasoning\": \"Clicking the search icon at the top right to open search\"}\n" ) if element_source == "vision": @@ -75,94 +91,138 @@ def compose_do_messages( ) else: system_content += ( - "\nUSE LOCATOR TOOLS:\n" - "1. FOR TEXT INPUT: input_text(element_index, text) - select from numbered list\n" - "2. FOR CLICKING: tap_element(index) - select from numbered list\n" - "3. OTHER: scroll_down(), swipe_left/right/up(), long_press(index), hide_keyboard(), go_back()\n" + "\n🎯 TOOL SELECTION RULES:\n" + "1. IF element is VISIBLE in the UI list → USE tap_element(index) to click it\n" + "2. IF you need to type text → USE input_text(index, text)\n" + "3. IF target element is NOT in the list → USE scroll_down/swipe_up to reveal it\n" + "4. NEVER use scroll/swipe when the target element is already visible!\n" + "5. scroll_down, swipe_up, swipe_left, swipe_right are ONLY for navigation - NOT for clicking!\n" + "6. To click ANY element from the list, ALWAYS use tap_element(index)\n" + "\nCRITICAL NOTES:\n" + "- The screenshot shows NUMBERED bounding boxes. Use what you SEE in the image!\n" + "- tap_element() clicks by COORDINATES - you CAN tap ANY visible element, even if not marked as clickable\n" + "- If you see the target element on screen, CLICK IT directly with tap_element()\n" + "- Search suggestions, list items, buttons = ALL require tap_element()\n" ) system_content += ( "\nIMPORTANT: You are working with MOBILE apps (Android/iOS), NOT web browsers." ) - else: - system_content = ( - "You are a WEB test automation engine.\n" - "Your job: analyze the instruction and call the appropriate function to interact with the web page.\n" - ) - - if element_source == "vision": - system_content += ( - "\nUSE VISUAL TOOLS:\n" - "- click_visual_element(description): Click by visual description\n" - "- input_text_visual(description, text): Input text by visual description\n" - "- hover_visual(description): Hover by visual description\n" - "- double_click_visual(description): Double click by visual description\n" - "- Elements were detected using computer vision (OmniParser)\n" - ) - else: - system_content += ( - "\nUSE LOCATOR TOOLS:\n" - "1. FOR TEXT INPUT: input_text(index, text) for or