From becbd94011d0fdc37543a6b096f959ef0025f579 Mon Sep 17 00:00:00 2001 From: hassineabd Date: Tue, 30 Dec 2025 13:18:56 +0100 Subject: [PATCH 1/5] added logging reason for choosing a specific tool --- Agent/tools/mobile/click_element.py | 10 ++++++++-- Agent/tools/mobile/go_back.py | 9 ++++++++- Agent/tools/mobile/hide_keyboard.py | 9 ++++++++- Agent/tools/mobile/input_text.py | 20 +++++++++++++------- Agent/tools/mobile/long_press.py | 6 ++++++ Agent/tools/mobile/scroll_down.py | 11 +++++++++-- Agent/tools/mobile/swipe_left.py | 11 +++++++++-- Agent/tools/mobile/swipe_right.py | 11 +++++++++-- Agent/tools/mobile/swipe_up.py | 11 +++++++++-- 9 files changed, 79 insertions(+), 19 deletions(-) diff --git a/Agent/tools/mobile/click_element.py b/Agent/tools/mobile/click_element.py index c5048f3..6cf4feb 100644 --- a/Agent/tools/mobile/click_element.py +++ b/Agent/tools/mobile/click_element.py @@ -15,11 +15,11 @@ class ClickElementTool(BaseTool): @property def name(self) -> str: - return "tap_element" + return "click_element" @property def description(self) -> str: - return "Tap/click element by INDEX. DO NOT use for text input - use input_text instead." + return "CLICK/TAP on ANY visible element (buttons, links, suggestions, icons, list items). USE THIS for all clicking actions." @property def category(self) -> ToolCategory: @@ -45,6 +45,10 @@ def get_parameters_schema(self) -> Dict[str, Any]: "type": "integer", "description": "The index number of the element from the UI elements list (1-based)", "minimum": 1 + }, + "reasoning": { + "type": "string", + "description": "Brief explanation (1 sentence) of WHY you chose this element and action" } }, "required": ["element_index"] @@ -57,6 +61,7 @@ def execute( context: Dict[str, Any] ) -> None: element_index = arguments["element_index"] + reasoning = arguments.get("reasoning", "No reasoning provided") ui_candidates = context.get("ui_candidates", []) if element_index < 1 or element_index > len(ui_candidates): @@ -67,6 +72,7 @@ def execute( element = ui_candidates[element_index - 1] x, y = get_element_center(element) + logger.info(f"🧠 AI reasoning: {reasoning}") logger.debug(f"Tapping at ({x}, {y}) for element: {element.get('text', '')}") executor.run_keyword("Tap", [x, y]) diff --git a/Agent/tools/mobile/go_back.py b/Agent/tools/mobile/go_back.py index 1a81e1e..90f2538 100644 --- a/Agent/tools/mobile/go_back.py +++ b/Agent/tools/mobile/go_back.py @@ -32,7 +32,12 @@ def works_on_coordinates(self) -> bool: def get_parameters_schema(self) -> Dict[str, Any]: return { "type": "object", - "properties": {}, + "properties": { + "reasoning": { + "type": "string", + "description": "Brief explanation (1 sentence) of WHY you chose this action" + } + }, "required": [] } @@ -42,5 +47,7 @@ def execute( arguments: Dict[str, Any], context: Dict[str, Any] ) -> None: + reasoning = arguments.get("reasoning", "No reasoning provided") + logger.info(f"🧠 AI reasoning: {reasoning}") executor.run_keyword("Go Back") diff --git a/Agent/tools/mobile/hide_keyboard.py b/Agent/tools/mobile/hide_keyboard.py index fffc99c..4575e91 100644 --- a/Agent/tools/mobile/hide_keyboard.py +++ b/Agent/tools/mobile/hide_keyboard.py @@ -33,7 +33,12 @@ def works_on_coordinates(self) -> bool: def get_parameters_schema(self) -> Dict[str, Any]: return { "type": "object", - "properties": {}, + "properties": { + "reasoning": { + "type": "string", + "description": "Brief explanation (1 sentence) of WHY you chose this action" + } + }, "required": [] } @@ -43,6 +48,8 @@ def execute( arguments: Dict[str, 
Any],
         context: Dict[str, Any]
     ) -> None:
+        reasoning = arguments.get("reasoning", "No reasoning provided")
+        logger.info(f"🧠 AI reasoning: {reasoning}")
         # Hide Keyboard without arguments for iOS/Android compatibility
         executor.run_keyword("Hide Keyboard")

diff --git a/Agent/tools/mobile/input_text.py b/Agent/tools/mobile/input_text.py
index c6d9679..caad5ac 100644
--- a/Agent/tools/mobile/input_text.py
+++ b/Agent/tools/mobile/input_text.py
@@ -13,7 +13,7 @@ def name(self) -> str:

     @property
     def description(self) -> str:
-        return "USE THIS when instruction contains 'input', 'type', 'enter', 'fill', 'write', 'saisir', 'taper' or mentions entering text. Types text into a text field."
+        return "USE THIS ONLY when instruction explicitly mentions entering TEXT: 'input', 'type', 'enter', 'fill', 'write', 'saisir', 'taper'. Types text into a text field. DO NOT use this tool to click or tap - use click_element for that."

     @property
     def category(self) -> ToolCategory:
@@ -39,6 +39,10 @@ def get_parameters_schema(self) -> Dict[str, Any]:
                 "text": {
                     "type": "string",
                     "description": "The text to input into the element"
+                },
+                "reasoning": {
+                    "type": "string",
+                    "description": "Brief explanation (1 sentence) of WHY you chose this element and action"
                 }
             },
             "required": ["element_index", "text"]
@@ -50,21 +54,23 @@ def execute(
         arguments: Dict[str, Any],
         context: Dict[str, Any]
     ) -> None:
-        element_index = arguments["element_index"]
-        text = arguments["text"]
+        element_index = arguments.get("element_index")
+        text = arguments.get("text")
+        reasoning = arguments.get("reasoning", "No reasoning provided")
         ui_candidates = context.get("ui_candidates", [])

-        if element_index < 1 or element_index > len(ui_candidates):
+        if not text:
+            raise AssertionError("'input_text' requires text argument. Use click_element to click without entering text.")
+
+        if element_index is None or element_index < 1 or element_index > len(ui_candidates):
             raise AssertionError(
                 f"Invalid element_index: {element_index}. 
Must be 1-{len(ui_candidates)}"
             )

-        if not text:
-            raise AssertionError("'input_text' requires text argument")
-
         element = ui_candidates[element_index - 1]
         x, y = get_element_center(element)

+        logger.info(f"🧠 AI reasoning: {reasoning}")
         logger.debug(f"Tapping at ({x}, {y}) to focus, then input: '{text}'")
         executor.run_keyword("Tap", [x, y])
         executor.run_keyword("Sleep", "1s")

diff --git a/Agent/tools/mobile/long_press.py b/Agent/tools/mobile/long_press.py
index d3bb67c..5478635 100644
--- a/Agent/tools/mobile/long_press.py
+++ b/Agent/tools/mobile/long_press.py
@@ -35,6 +35,10 @@ def get_parameters_schema(self) -> Dict[str, Any]:
                     "type": "integer",
                     "description": "The index number of the element from the UI elements list (1-based)",
                     "minimum": 1
+                },
+                "reasoning": {
+                    "type": "string",
+                    "description": "Brief explanation (1 sentence) of WHY you chose this element and action"
                 }
             },
             "required": ["element_index"]
@@ -47,6 +51,7 @@ def execute(
         context: Dict[str, Any]
     ) -> None:
         element_index = arguments["element_index"]
+        reasoning = arguments.get("reasoning", "No reasoning provided")
         ui_candidates = context.get("ui_candidates", [])

         if element_index < 1 or element_index > len(ui_candidates):
@@ -57,6 +62,7 @@ def execute(
         element = ui_candidates[element_index - 1]
         x, y = get_element_center(element)

+        logger.info(f"🧠 AI reasoning: {reasoning}")
         logger.debug(f"Long pressing at ({x}, {y}) for 2s")
         executor.run_keyword("Tap", [x, y], 1, "2s")

diff --git a/Agent/tools/mobile/scroll_down.py b/Agent/tools/mobile/scroll_down.py
index 618b67d..0829c01 100644
--- a/Agent/tools/mobile/scroll_down.py
+++ b/Agent/tools/mobile/scroll_down.py
@@ -12,7 +12,7 @@ def name(self) -> str:

     @property
     def description(self) -> str:
-        return "Scroll down the mobile screen"
+        return "NAVIGATION ONLY: Scroll content DOWN to reveal elements BELOW. NOT for clicking visible elements - use click_element to click."

     @property
     def category(self) -> ToolCategory:
@@ -29,7 +29,12 @@ def works_on_coordinates(self) -> bool:
     def get_parameters_schema(self) -> Dict[str, Any]:
         return {
             "type": "object",
-            "properties": {},
+            "properties": {
+                "reasoning": {
+                    "type": "string",
+                    "description": "Brief explanation (1 sentence) of WHY you chose this action"
+                }
+            },
             "required": []
         }

@@ -39,5 +44,7 @@ def execute(
         arguments: Dict[str, Any],
         context: Dict[str, Any]
     ) -> None:
+        reasoning = arguments.get("reasoning", "No reasoning provided")
+        logger.info(f"🧠 AI reasoning: {reasoning}")
         executor.run_keyword("Swipe By Percent", 50, 80, 50, 20, "1s")

diff --git a/Agent/tools/mobile/swipe_left.py b/Agent/tools/mobile/swipe_left.py
index 0ecb460..1931173 100644
--- a/Agent/tools/mobile/swipe_left.py
+++ b/Agent/tools/mobile/swipe_left.py
@@ -15,7 +15,7 @@ def name(self) -> str:

     @property
     def description(self) -> str:
-        return "Swipe left on the mobile screen (for carousel, tabs, horizontal scrolling)"
+        return "USE THIS ONLY for horizontal navigation: carousels, image galleries, tabs. Do NOT use to click on visible elements - use click_element instead."
     @property
     def category(self) -> ToolCategory:
@@ -32,7 +32,12 @@ def works_on_coordinates(self) -> bool:
     def get_parameters_schema(self) -> Dict[str, Any]:
         return {
             "type": "object",
-            "properties": {},
+            "properties": {
+                "reasoning": {
+                    "type": "string",
+                    "description": "Brief explanation (1 sentence) of WHY you chose this action"
+                }
+            },
             "required": []
         }

@@ -42,6 +47,8 @@ def execute(
         arguments: Dict[str, Any],
         context: Dict[str, Any]
     ) -> None:
+        reasoning = arguments.get("reasoning", "No reasoning provided")
+        logger.info(f"🧠 AI reasoning: {reasoning}")
         # Swipe from right (80%) to left (20%) horizontally, middle of screen vertically
         executor.run_keyword("Swipe By Percent", 80, 50, 20, 50, "1s")

diff --git a/Agent/tools/mobile/swipe_right.py b/Agent/tools/mobile/swipe_right.py
index 8692619..c5a63b1 100644
--- a/Agent/tools/mobile/swipe_right.py
+++ b/Agent/tools/mobile/swipe_right.py
@@ -15,7 +15,7 @@ def name(self) -> str:

     @property
     def description(self) -> str:
-        return "Swipe right on the mobile screen (for carousel, tabs, horizontal scrolling)"
+        return "USE THIS ONLY for horizontal navigation: carousels, image galleries, tabs. Do NOT use to click on visible elements - use click_element instead."

     @property
     def category(self) -> ToolCategory:
@@ -32,7 +32,12 @@ def works_on_coordinates(self) -> bool:
     def get_parameters_schema(self) -> Dict[str, Any]:
         return {
             "type": "object",
-            "properties": {},
+            "properties": {
+                "reasoning": {
+                    "type": "string",
+                    "description": "Brief explanation (1 sentence) of WHY you chose this action"
+                }
+            },
             "required": []
         }

@@ -42,6 +47,8 @@ def execute(
         arguments: Dict[str, Any],
         context: Dict[str, Any]
     ) -> None:
+        reasoning = arguments.get("reasoning", "No reasoning provided")
+        logger.info(f"🧠 AI reasoning: {reasoning}")
         # Swipe from left (20%) to right (80%) horizontally, middle of screen vertically
         executor.run_keyword("Swipe By Percent", 20, 50, 80, 50, "1s")

diff --git a/Agent/tools/mobile/swipe_up.py b/Agent/tools/mobile/swipe_up.py
index 95df12b..7d40e6c 100644
--- a/Agent/tools/mobile/swipe_up.py
+++ b/Agent/tools/mobile/swipe_up.py
@@ -12,7 +12,7 @@ def name(self) -> str:

     @property
     def description(self) -> str:
-        return "Scroll content UP (reveal content above)"
+        return "NAVIGATION ONLY: Scroll content UP to reveal elements ABOVE. NOT for clicking visible elements - use click_element to click."
@property def category(self) -> ToolCategory: @@ -29,7 +29,12 @@ def works_on_coordinates(self) -> bool: def get_parameters_schema(self) -> Dict[str, Any]: return { "type": "object", - "properties": {}, + "properties": { + "reasoning": { + "type": "string", + "description": "Brief explanation (1 sentence) of WHY you chose this action" + } + }, "required": [] } @@ -39,6 +44,8 @@ def execute( arguments: Dict[str, Any], context: Dict[str, Any] ) -> None: + reasoning = arguments.get("reasoning", "No reasoning provided") + logger.info(f"🧠 AI reasoning: {reasoning}") # Swipe from top (20%) to bottom (80%) vertically - scrolls content UP executor.run_keyword("Swipe By Percent", 50, 20, 50, 80, "1s") From 825737c9a0b4a91fa00256a4c7a7f82979d6517d Mon Sep 17 00:00:00 2001 From: hassineabd Date: Tue, 30 Dec 2025 13:19:13 +0100 Subject: [PATCH 2/5] ask moved as a screen tools --- Agent/tools/screen/__init__.py | 8 ++ Agent/tools/screen/answer_json.py | 52 +++++++++++ Agent/tools/screen/answer_text.py | 50 ++++++++++ Agent/tools/screen/assert_screen.py | 139 ++++++++++++++++++++++++++++ 4 files changed, 249 insertions(+) create mode 100644 Agent/tools/screen/__init__.py create mode 100644 Agent/tools/screen/answer_json.py create mode 100644 Agent/tools/screen/answer_text.py create mode 100644 Agent/tools/screen/assert_screen.py diff --git a/Agent/tools/screen/__init__.py b/Agent/tools/screen/__init__.py new file mode 100644 index 0000000..0ac52af --- /dev/null +++ b/Agent/tools/screen/__init__.py @@ -0,0 +1,8 @@ +from Agent.tools.screen.answer_text import AnswerTextTool +from Agent.tools.screen.answer_json import AnswerJsonTool +from Agent.tools.screen.assert_screen import AssertScreenTool + +SCREEN_TOOLS = [AnswerTextTool, AnswerJsonTool, AssertScreenTool] + +__all__ = ["SCREEN_TOOLS", "AnswerTextTool", "AnswerJsonTool", "AssertScreenTool"] + diff --git a/Agent/tools/screen/answer_json.py b/Agent/tools/screen/answer_json.py new file mode 100644 index 0000000..f4e7361 --- /dev/null +++ b/Agent/tools/screen/answer_json.py @@ -0,0 +1,52 @@ +from typing import Any, Dict +import json +from Agent.tools.base import BaseTool, ExecutorProtocol, ToolCategory +from robot.api import logger + + +class AnswerJsonTool(BaseTool): + """Answer question about the screen with JSON response.""" + + @property + def name(self) -> str: + return "answer_question_json" + + @property + def description(self) -> str: + return "Provide a JSON object answer to the question about the screen content" + + @property + def category(self) -> ToolCategory: + return ToolCategory.SCREEN + + @property + def works_on_locator(self) -> bool: + return False + + @property + def works_on_coordinates(self) -> bool: + return False + + def get_parameters_schema(self) -> Dict[str, Any]: + return { + "type": "object", + "properties": { + "answer": { + "type": "object", + "description": "The JSON object answer to the question based on what you see in the screenshot" + } + }, + "required": ["answer"] + } + + def execute( + self, + executor: ExecutorProtocol, + arguments: Dict[str, Any], + context: Dict[str, Any] + ) -> str: + answer = arguments.get("answer", {}) + answer_str = json.dumps(answer, ensure_ascii=False) + logger.info(f"💬 AI Answer (JSON): {answer_str[:100]}..." 
if len(answer_str) > 100 else f"💬 AI Answer (JSON): {answer_str}") + return answer_str + diff --git a/Agent/tools/screen/answer_text.py b/Agent/tools/screen/answer_text.py new file mode 100644 index 0000000..21dd151 --- /dev/null +++ b/Agent/tools/screen/answer_text.py @@ -0,0 +1,50 @@ +from typing import Any, Dict +from Agent.tools.base import BaseTool, ExecutorProtocol, ToolCategory +from robot.api import logger + + +class AnswerTextTool(BaseTool): + """Answer question about the screen with text response.""" + + @property + def name(self) -> str: + return "answer_question" + + @property + def description(self) -> str: + return "Provide a text answer to the question about the screen content" + + @property + def category(self) -> ToolCategory: + return ToolCategory.SCREEN + + @property + def works_on_locator(self) -> bool: + return False + + @property + def works_on_coordinates(self) -> bool: + return False + + def get_parameters_schema(self) -> Dict[str, Any]: + return { + "type": "object", + "properties": { + "answer": { + "type": "string", + "description": "The text answer to the question based on what you see in the screenshot" + } + }, + "required": ["answer"] + } + + def execute( + self, + executor: ExecutorProtocol, + arguments: Dict[str, Any], + context: Dict[str, Any] + ) -> str: + answer = arguments.get("answer", "") + logger.info(f"💬 AI Answer: {answer[:100]}..." if len(answer) > 100 else f"💬 AI Answer: {answer}") + return answer + diff --git a/Agent/tools/screen/assert_screen.py b/Agent/tools/screen/assert_screen.py new file mode 100644 index 0000000..5a8f2fb --- /dev/null +++ b/Agent/tools/screen/assert_screen.py @@ -0,0 +1,139 @@ +from typing import Any, Dict +from Agent.tools.base import BaseTool, ExecutorProtocol, ToolCategory +from robot.api import logger + + +class AssertScreenTool(BaseTool): + """Screen assertion tool - analyzes screenshots to verify conditions. + + This tool is used by Agent.VisualCheck to verify UI states, presence of elements, + visual appearance, etc. by analyzing screenshots with AI vision models. 
+ """ + + @property + def name(self) -> str: + return "assert_screen" + + @property + def description(self) -> str: + return "Report the results of visual verification against the given instruction" + + @property + def category(self) -> ToolCategory: + return ToolCategory.SCREEN + + def get_parameters_schema(self) -> Dict[str, Any]: + return { + "type": "object", + "properties": { + "verification_result": { + "type": "boolean", + "description": "Whether the screenshot matches the instruction (true) or not (false)" + }, + "confidence_score": { + "type": "number", + "description": "Confidence level of the verification from 0.0 (no confidence) to 1.0 (completely confident)", + "minimum": 0.0, + "maximum": 1.0 + }, + "analysis": { + "type": "string", + "description": "Detailed analysis explaining why the verification passed or failed" + }, + "found_elements": { + "type": "array", + "description": "Optional list of UI elements found in the screenshot", + "items": { + "type": "object", + "properties": { + "element_type": {"type": "string"}, + "description": {"type": "string"}, + "location": {"type": "string"}, + "confidence": {"type": "number"} + } + } + }, + "issues": { + "type": "array", + "description": "Optional list of issues or problems found", + "items": {"type": "string"} + } + }, + "required": ["verification_result", "confidence_score", "analysis"] + } + + def execute( + self, + executor: ExecutorProtocol, + arguments: Dict[str, Any], + context: Dict[str, Any] + ) -> None: + """Execute visual verification - log results and assert if failed. + + Note: Screen tools don't use the executor for actions, they analyze results. + """ + verification_result = arguments.get("verification_result") + confidence_score = arguments.get("confidence_score") + analysis = arguments.get("analysis") + found_elements = arguments.get("found_elements", []) + issues = arguments.get("issues", []) + + min_confidence = context.get("min_confidence", 0.7) + + logger.info(f"👁️ Visual verification results: {arguments}") + + logger.debug("=" * 80) + logger.debug("AI VISUAL VERIFICATION RESPONSE") + logger.debug("=" * 80) + logger.debug(f"Verification Result: {'PASS' if verification_result else 'FAIL'}") + logger.debug(f"Confidence Score: {confidence_score:.2f}") + logger.debug(f"Analysis: {analysis}") + + if found_elements: + logger.debug(f"Found Elements ({len(found_elements)} total):") + for i, element in enumerate(found_elements[:10], 1): + element_type = element.get("element_type", "unknown") + description = element.get("description", "no description") + location = element.get("location", "unknown location") + confidence = element.get("confidence", 0.0) + logger.debug(f" {i}. {element_type}: {description}") + logger.debug(f" Location: {location}") + logger.debug(f" Confidence: {confidence:.2f}") + + if issues: + logger.debug(f"Issues Found ({len(issues)} total):") + for i, issue in enumerate(issues, 1): + logger.debug(f" {i}. {issue}") + + logger.debug("=" * 80) + + logger.debug(f"🔍 Verification result: {verification_result}") + logger.debug(f"📊 Confidence score: {confidence_score}") + logger.debug(f"📝 Analysis: {analysis}") + + if found_elements: + logger.debug(f"🎯 Found elements: {len(found_elements)} elements detected") + for i, element in enumerate(found_elements[:5], 1): + element_type = element.get("element_type", "unknown") + description = element.get("description", "no description") + confidence = element.get("confidence", 0.0) + logger.debug(f" {i}. 
{element_type}: {description} (confidence: {confidence:.2f})") + + if issues: + logger.debug(f"⚠️ Issues found: {len(issues)} issues detected") + for i, issue in enumerate(issues[:3], 1): + logger.debug(f" {i}. {issue}") + + if not verification_result: + error_msg = f"Visual verification failed. Analysis: {analysis}" + if issues: + error_msg += f" Issues: {', '.join(issues[:3])}" + raise AssertionError(error_msg) + elif confidence_score < min_confidence: + raise AssertionError( + f"Confidence score too low: {confidence_score:.2f} < {min_confidence} " + f"(threshold). Analysis: {analysis}" + ) + else: + logger.info(f"✅ Visual verification passed (confidence: {confidence_score:.2f})") + From a903ef7534b0069a0b78e36551b5e381cb6db15c Mon Sep 17 00:00:00 2001 From: hassineabd Date: Tue, 30 Dec 2025 13:19:52 +0100 Subject: [PATCH 3/5] adaption and selection one screen tool programatically to reduce hallucination --- Agent/agent_engine.py | 102 +++++++++------------- Agent/ai/_promptcomposer.py | 30 +++---- Agent/tools/base.py | 4 +- Agent/tools/registry.py | 46 +++++++++- Agent/tools/visual/__init__.py | 10 --- Agent/tools/visual/verify_match.py | 135 ----------------------------- 6 files changed, 99 insertions(+), 228 deletions(-) delete mode 100644 Agent/tools/visual/__init__.py delete mode 100644 Agent/tools/visual/verify_match.py diff --git a/Agent/agent_engine.py b/Agent/agent_engine.py index dbba091..aaaf58b 100644 --- a/Agent/agent_engine.py +++ b/Agent/agent_engine.py @@ -8,7 +8,9 @@ from Agent.tools.base import ToolCategory from Agent.core.keyword_runner import KeywordRunner from Agent.tools.mobile import MOBILE_TOOLS -from Agent.tools.visual import VISUAL_TOOLS +from Agent.tools.screen.answer_text import AnswerTextTool +from Agent.tools.screen.answer_json import AnswerJsonTool +from Agent.tools.screen.assert_screen import AssertScreenTool from robot.api import logger @@ -38,13 +40,16 @@ def __init__( self.executor = KeywordRunner(self.platform) self._register_mobile_tools() - self._register_visual_tools() self.prompt_composer = AgentPromptComposer( tool_registry=self.tool_registry, platform_connector=self.platform ) + self.tool_registry.register(AnswerTextTool()) + self.tool_registry.register(AnswerJsonTool()) + self.tool_registry.register(AssertScreenTool()) + self.element_source = element_source self.llm_input_format = llm_input_format logger.info(f"🎯 Element source: {element_source}, LLM input format: {llm_input_format}") @@ -55,12 +60,6 @@ def _register_mobile_tools(self) -> None: mobile_tools_count = len(self.tool_registry.get_by_category(ToolCategory.MOBILE)) logger.debug(f"📱 Registered {mobile_tools_count} mobile tools") - def _register_visual_tools(self) -> None: - for ToolClass in VISUAL_TOOLS: - self.tool_registry.register(ToolClass()) - visual_tools_count = len(self.tool_registry.get_by_category(ToolCategory.VISUAL)) - logger.debug(f"👁️ Registered {visual_tools_count} visual tools") - # ----------------------- Public API ----------------------- def set_element_source(self, source: str) -> None: @@ -203,44 +202,44 @@ def do(self, instruction: str) -> None: self._execute_do_from_tool_calls(result, context, instruction) logger.info("Agent.Do completed") - def visual_check(self, instruction: str) -> None: + def visual_check(self, instruction: str, min_confidence: float = 0.7) -> None: """Execute visual verification based on natural language instruction. 
Args: instruction: Natural language verification instruction (e.g., "verify the home screen is displayed") + min_confidence: Minimum confidence score required (0.0-1.0, default 0.7) """ - logger.info(f"👁️ Starting Agent.VisualCheck: '{instruction}'") + logger.info(f"👁️ Starting Agent.VisualCheck: '{instruction}' (min_confidence={min_confidence})") if hasattr(self.platform, 'wait_for_page_stable'): self.platform.wait_for_page_stable() screenshot_base64 = self.platform.get_screenshot_base64() - - # Embed screenshot to Robot Framework log self.platform.embed_image_to_log(screenshot_base64) - logger.debug("Screenshot captured and sent to AI for analysis") + image_url = self.image_uploader.upload_from_base64(screenshot_base64) - - # Prepare AI request - messages = self.prompt_composer.compose_visual_check_messages(instruction, image_url) - tools = self.prompt_composer.get_visual_check_tools() - logger.debug(f"Visual check tools: {len(tools)} tools") - if not tools: - raise RuntimeError("No visual tools registered. Check tool registration.") + tool = self.tool_registry.get_tool_for_query("visual_check") + if not tool: + raise AssertionError("visual_check tool not found") + + messages = self.prompt_composer.compose_visual_check_messages(instruction, image_url) - # Call AI result = self.llm.send_ai_request_with_tools( messages=messages, - tools=tools, + tools=[tool.to_tool_spec()], tool_choice="required", temperature=0 ) - logger.debug("Executing visual verification...") - self._execute_visual_check_from_tool_calls(result) - logger.debug("Agent.VisualCheck completed successfully") + tool_call = result.get("tool_calls", [{}])[0] + arguments = tool_call.get("function", {}).get("arguments", {}) + + context = {"min_confidence": min_confidence} + tool.execute(self.executor, arguments, context) + + logger.info("Agent.VisualCheck completed") def ask(self, question: str, response_format: str = "text") -> str: """Ask AI a question about the current screen. @@ -252,8 +251,7 @@ def ask(self, question: str, response_format: str = "text") -> str: Returns: AI response as string (or JSON string if format=json) """ - import json - logger.info(f"❓ Agent.Ask: '{question}'") + logger.info(f"❓ Starting Agent.Ask: '{question}' (format: {response_format})") if hasattr(self.platform, 'wait_for_page_stable'): self.platform.wait_for_page_stable() @@ -261,18 +259,25 @@ def ask(self, question: str, response_format: str = "text") -> str: screenshot_base64 = self.platform.get_screenshot_base64() self.platform.embed_image_to_log(screenshot_base64) - messages = self.prompt_composer.compose_ask_messages( - question, screenshot_base64, response_format + tool = self.tool_registry.get_tool_for_query("ask", response_format=response_format) + if not tool: + raise AssertionError(f"No tool found for response_format: {response_format}") + + messages = self.prompt_composer.compose_ask_messages(question, screenshot_base64, response_format) + + result = self.llm.send_ai_request_with_tools( + messages=messages, + tools=[tool.to_tool_spec()], + tool_choice="required", + temperature=0 ) - if response_format == "json": - response_dict = self.llm.send_ai_request_and_return_response(messages=messages, temperature=0) - response = json.dumps(response_dict, ensure_ascii=False) - else: - response = self.llm.send_ai_request(messages=messages, temperature=0) + tool_call = result.get("tool_calls", [{}])[0] + arguments = tool_call.get("function", {}).get("arguments", {}) - logger.info(f"💬 Response: {response[:100]}..." 
if len(response) > 100 else f"💬 Response: {response}") - return response + answer = tool.execute(self.executor, arguments, {}) + logger.info("Agent.Ask completed") + return answer def find_visual_element(self, description: str, format: str = "center") -> Dict[str, Any]: """Find element visually using OmniParser and return bbox. @@ -358,29 +363,4 @@ def _execute_do_from_tool_calls( # Execute the tool tool.execute(self.executor, arguments, context) - def _execute_visual_check_from_tool_calls(self, result: Dict[str, Any]) -> None: - """Execute visual check from tool calls returned by the LLM using the tool registry.""" - tool_calls = result.get("tool_calls", []) - - if not tool_calls: - logger.error("No tool calls in visual check response") - raise AssertionError("AI did not return any tool calls for visual verification") - - # Extract the first tool call (typically verify_visual_match) - tool_call = tool_calls[0] - function_name = tool_call["function"]["name"] - arguments = tool_call["function"]["arguments"] - - logger.debug(f"⚙️ Executing visual tool: {function_name}") - - # Get tool from registry - tool = self.tool_registry.get(function_name) - if not tool: - raise AssertionError(f"Unknown visual tool: {function_name}") - - # Prepare context for tool execution (visual tools don't need ui_candidates) - context = {} - - # Execute the visual tool (will handle logging and assertions) - tool.execute(self.executor, arguments, context) diff --git a/Agent/ai/_promptcomposer.py b/Agent/ai/_promptcomposer.py index df3324c..dd6d261 100644 --- a/Agent/ai/_promptcomposer.py +++ b/Agent/ai/_promptcomposer.py @@ -202,7 +202,7 @@ def compose_visual_check_messages( system_content = ( "You are a mobile app visual verification engine. " "Analyze the screenshot and verify if it matches the instruction. " - "Use the verify_visual_match function to report your findings." + "Use the assert_screen function to report your findings." ) user_content = [ {"type": "text", "text": f"Verify: {instruction}"}, @@ -214,32 +214,24 @@ def compose_visual_check_messages( {"role": "user", "content": user_content} ] - def get_visual_check_tools(self) -> List[Dict[str, Any]]: - """Return tool definitions for visual check actions from the registry. - - Returns tool specs in standard format (works with OpenAI, Anthropic, Gemini, etc.) - """ - return self.registry.get_tool_specs(category=ToolCategory.VISUAL) def compose_ask_messages( self, question: str, screenshot_base64: str, - response_format: str = "text", + response_format: str = "text" ) -> List[Dict[str, Any]]: - """Build messages for asking AI about current screen.""" + """Build messages for asking AI about current screen using tool calling.""" if response_format == "json": - system_content = ( - "You are a screen analysis assistant. " - "Answer questions about what you see in the screenshot. " - "IMPORTANT: Always respond with valid JSON only, no markdown, no explanation outside JSON." - ) + instruction = "Use the answer_question_json function to provide your answer as a JSON object." else: - system_content = ( - "You are a screen analysis assistant. " - "Answer questions about what you see in the screenshot. " - "Be concise and direct." - ) + instruction = "Use the answer_question function to provide your answer as text." + + system_content = ( + "You are a screen analysis assistant. " + "Answer questions about what you see in the screenshot. 
" + f"{instruction}" + ) user_content = [ {"type": "text", "text": question}, diff --git a/Agent/tools/base.py b/Agent/tools/base.py index ef6337f..77cd27c 100644 --- a/Agent/tools/base.py +++ b/Agent/tools/base.py @@ -7,7 +7,7 @@ class ToolCategory(Enum): MOBILE = "mobile" WEB = "web" - VISUAL = "visual" + SCREEN = "screen" class BaseTool(ABC): @@ -35,7 +35,7 @@ def description(self) -> str: @property @abstractmethod def category(self) -> ToolCategory: - """Tool category: ToolCategory.MOBILE, ToolCategory.WEB, or ToolCategory.VISUAL.""" + """Tool category: ToolCategory.MOBILE, ToolCategory.WEB, or ToolCategory.SCREEN.""" pass @property diff --git a/Agent/tools/registry.py b/Agent/tools/registry.py index b6230e8..ba7284e 100644 --- a/Agent/tools/registry.py +++ b/Agent/tools/registry.py @@ -8,10 +8,21 @@ class ToolRegistry: Tools can be registered dynamically and retrieved by: - name - - category (mobile, web, visual) + - category (mobile, web, screen) - all tools + - action-based selection (filters tools by parameters) """ + SCREEN_TOOL_SELECTION_RULES = { + "ask": { + "text": "answer_question", + "json": "answer_question_json" + }, + "visual_check": { + "default": "assert_screen" + } + } + _instance: Optional['ToolRegistry'] = None _tools: Dict[str, BaseTool] = {} @@ -86,6 +97,39 @@ def get_tools_for_source(self, category: Union[ToolCategory, str], element_sourc if not (tool.works_on_coordinates and not tool.works_on_locator) ] + def get_tool_for_query(self, query_type: str, **kwargs) -> Optional[BaseTool]: + """Get single tool based on query type and parameters. + + Args: + query_type: Query type (e.g., "ask", "visual_check") + **kwargs: Parameters to filter tool (e.g., response_format="json") + + Returns: + Single tool matching criteria or None if not found + + Example: + tool = registry.get_tool_for_query("ask", response_format="json") + tool = registry.get_tool_for_query("visual_check") + """ + rules = self.SCREEN_TOOL_SELECTION_RULES.get(query_type) + if not rules: + logger.warn(f"No selection rules found for query: {query_type}") + return None + + if query_type == "ask": + format_key = kwargs.get("response_format", "text") + tool_name = rules.get(format_key) + elif query_type == "visual_check": + tool_name = rules.get("default") + else: + return None + + if not tool_name: + logger.warn(f"No tool found for query: {query_type} with kwargs: {kwargs}") + return None + + return self.get(tool_name) + def clear(self) -> None: self._tools.clear() diff --git a/Agent/tools/visual/__init__.py b/Agent/tools/visual/__init__.py deleted file mode 100644 index 00bca38..0000000 --- a/Agent/tools/visual/__init__.py +++ /dev/null @@ -1,10 +0,0 @@ -from Agent.tools.visual.verify_match import VerifyVisualMatchTool - - -# Declarative list of visual tools for registration -VISUAL_TOOLS = [ - VerifyVisualMatchTool, -] - -__all__ = ["VISUAL_TOOLS", "VerifyVisualMatchTool"] - diff --git a/Agent/tools/visual/verify_match.py b/Agent/tools/visual/verify_match.py deleted file mode 100644 index abc41c6..0000000 --- a/Agent/tools/visual/verify_match.py +++ /dev/null @@ -1,135 +0,0 @@ -from typing import Any, Dict -from Agent.tools.base import BaseTool, ExecutorProtocol, ToolCategory -from robot.api import logger - - -class VerifyVisualMatchTool(BaseTool): - """Visual verification tool - analyzes screenshots to verify conditions. - - This tool is used by Agent.VisualCheck to verify UI states, presence of elements, - visual appearance, etc. by analyzing screenshots with AI vision models. 
- """ - - @property - def name(self) -> str: - return "verify_visual_match" - - @property - def description(self) -> str: - return "Report the results of visual verification against the given instruction" - - @property - def category(self) -> ToolCategory: - return ToolCategory.VISUAL - - def get_parameters_schema(self) -> Dict[str, Any]: - return { - "type": "object", - "properties": { - "verification_result": { - "type": "boolean", - "description": "Whether the screenshot matches the instruction (true) or not (false)" - }, - "confidence_score": { - "type": "number", - "description": "Confidence level of the verification from 0.0 (no confidence) to 1.0 (completely confident)", - "minimum": 0.0, - "maximum": 1.0 - }, - "analysis": { - "type": "string", - "description": "Detailed analysis explaining why the verification passed or failed" - }, - "found_elements": { - "type": "array", - "description": "Optional list of UI elements found in the screenshot", - "items": { - "type": "object", - "properties": { - "element_type": {"type": "string"}, - "description": {"type": "string"}, - "location": {"type": "string"}, - "confidence": {"type": "number"} - } - } - }, - "issues": { - "type": "array", - "description": "Optional list of issues or problems found", - "items": {"type": "string"} - } - }, - "required": ["verification_result", "confidence_score", "analysis"] - } - - def execute( - self, - executor: ExecutorProtocol, - arguments: Dict[str, Any], - context: Dict[str, Any] - ) -> None: - """Execute visual verification - log results and assert if failed. - - Note: Visual tools don't use the executor for actions, they analyze results. - """ - verification_result = arguments.get("verification_result") - confidence_score = arguments.get("confidence_score") - analysis = arguments.get("analysis") - found_elements = arguments.get("found_elements", []) - issues = arguments.get("issues", []) - - logger.info(f"👁️ Visual verification results: {arguments}") - - # Log detailed AI response - logger.debug("=" * 80) - logger.debug("AI VISUAL VERIFICATION RESPONSE") - logger.debug("=" * 80) - logger.debug(f"Verification Result: {'PASS' if verification_result else 'FAIL'}") - logger.debug(f"Confidence Score: {confidence_score:.2f}") - logger.debug(f"Analysis: {analysis}") - - if found_elements: - logger.debug(f"Found Elements ({len(found_elements)} total):") - for i, element in enumerate(found_elements[:10], 1): - element_type = element.get("element_type", "unknown") - description = element.get("description", "no description") - location = element.get("location", "unknown location") - confidence = element.get("confidence", 0.0) - logger.debug(f" {i}. {element_type}: {description}") - logger.debug(f" Location: {location}") - logger.debug(f" Confidence: {confidence:.2f}") - - if issues: - logger.debug(f"Issues Found ({len(issues)} total):") - for i, issue in enumerate(issues, 1): - logger.debug(f" {i}. {issue}") - - logger.debug("=" * 80) - - # Compact log for custom logger - logger.debug(f"🔍 Verification result: {verification_result}") - logger.debug(f"📊 Confidence score: {confidence_score}") - logger.debug(f"📝 Analysis: {analysis}") - - if found_elements: - logger.debug(f"🎯 Found elements: {len(found_elements)} elements detected") - for i, element in enumerate(found_elements[:5], 1): - element_type = element.get("element_type", "unknown") - description = element.get("description", "no description") - confidence = element.get("confidence", 0.0) - logger.debug(f" {i}. 
{element_type}: {description} (confidence: {confidence:.2f})") - - if issues: - logger.debug(f"⚠️ Issues found: {len(issues)} issues detected") - for i, issue in enumerate(issues[:3], 1): - logger.debug(f" {i}. {issue}") - - # Assert based on verification result - if verification_result: - logger.info("✅ Visual verification passed") - else: - error_msg = f"Visual verification failed. Analysis: {analysis}" - if issues: - error_msg += f" Issues: {', '.join(issues[:3])}" - raise AssertionError(error_msg) - From 923f6e0db4ca6135538f76575f7587252e4a3a78 Mon Sep 17 00:00:00 2001 From: hassineabd Date: Tue, 30 Dec 2025 13:45:56 +0100 Subject: [PATCH 4/5] improve logging on assert screen tool --- Agent/tools/screen/assert_screen.py | 51 ++++------------------------- 1 file changed, 7 insertions(+), 44 deletions(-) diff --git a/Agent/tools/screen/assert_screen.py b/Agent/tools/screen/assert_screen.py index 5a8f2fb..715e88e 100644 --- a/Agent/tools/screen/assert_screen.py +++ b/Agent/tools/screen/assert_screen.py @@ -80,60 +80,23 @@ def execute( min_confidence = context.get("min_confidence", 0.7) - logger.info(f"👁️ Visual verification results: {arguments}") - - logger.debug("=" * 80) - logger.debug("AI VISUAL VERIFICATION RESPONSE") - logger.debug("=" * 80) - logger.debug(f"Verification Result: {'PASS' if verification_result else 'FAIL'}") - logger.debug(f"Confidence Score: {confidence_score:.2f}") + result_status = "PASS" if verification_result else "FAIL" + logger.info(f"Visual check: {result_status} (confidence: {confidence_score:.2f})") logger.debug(f"Analysis: {analysis}") if found_elements: - logger.debug(f"Found Elements ({len(found_elements)} total):") - for i, element in enumerate(found_elements[:10], 1): - element_type = element.get("element_type", "unknown") - description = element.get("description", "no description") - location = element.get("location", "unknown location") - confidence = element.get("confidence", 0.0) - logger.debug(f" {i}. {element_type}: {description}") - logger.debug(f" Location: {location}") - logger.debug(f" Confidence: {confidence:.2f}") - - if issues: - logger.debug(f"Issues Found ({len(issues)} total):") - for i, issue in enumerate(issues, 1): - logger.debug(f" {i}. {issue}") - - logger.debug("=" * 80) - - logger.debug(f"🔍 Verification result: {verification_result}") - logger.debug(f"📊 Confidence score: {confidence_score}") - logger.debug(f"📝 Analysis: {analysis}") - - if found_elements: - logger.debug(f"🎯 Found elements: {len(found_elements)} elements detected") - for i, element in enumerate(found_elements[:5], 1): - element_type = element.get("element_type", "unknown") - description = element.get("description", "no description") - confidence = element.get("confidence", 0.0) - logger.debug(f" {i}. {element_type}: {description} (confidence: {confidence:.2f})") + logger.debug(f"Found elements: {found_elements}") if issues: - logger.debug(f"⚠️ Issues found: {len(issues)} issues detected") - for i, issue in enumerate(issues[:3], 1): - logger.debug(f" {i}. {issue}") + logger.debug(f"Issues: {', '.join(issues[:3])}") if not verification_result: - error_msg = f"Visual verification failed. Analysis: {analysis}" + error_msg = f"Visual verification failed: {analysis}" if issues: - error_msg += f" Issues: {', '.join(issues[:3])}" + error_msg += f" | Issues: {', '.join(issues[:3])}" raise AssertionError(error_msg) elif confidence_score < min_confidence: raise AssertionError( - f"Confidence score too low: {confidence_score:.2f} < {min_confidence} " - f"(threshold). 
Analysis: {analysis}"
+                f"Confidence too low: {confidence_score:.2f} < {min_confidence}. {analysis}"
             )
-        else:
-            logger.info(f"✅ Visual verification passed (confidence: {confidence_score:.2f})")


From ef23457d1d1ceb5698c000d6cd0f57abfb0db7dc Mon Sep 17 00:00:00 2001
From: hassineabd
Date: Tue, 30 Dec 2025 13:46:18 +0100
Subject: [PATCH 5/5] improve click element tool description

---
 Agent/tools/mobile/click_element.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Agent/tools/mobile/click_element.py b/Agent/tools/mobile/click_element.py
index 6cf4feb..348f05b 100644
--- a/Agent/tools/mobile/click_element.py
+++ b/Agent/tools/mobile/click_element.py
@@ -19,7 +19,7 @@ def name(self) -> str:

     @property
     def description(self) -> str:
-        return "CLICK/TAP on ANY visible element (buttons, links, suggestions, icons, list items). USE THIS for all clicking actions."
+        return "CLICK/TAP on visible elements. PREFER elements that contain or enclose CLEAR TEXT/LABELS over icons when possible. Choose the most explicit element (e.g., text suggestions, labeled buttons) rather than ambiguous icons."

     @property
     def category(self) -> ToolCategory:
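
Taken together, patches 2 and 3 route Agent.Ask and Agent.VisualCheck through a single screen tool selected programmatically from the registry (via SCREEN_TOOL_SELECTION_RULES), so the LLM only ever sees one tool spec per query, while patch 1 lets every mobile tool carry an optional "reasoning" argument that is logged. A minimal usage sketch, assuming only the modules and signatures introduced in this series and an environment where the Agent package and Robot Framework are importable; it is an illustration, not part of the commits:

    from Agent.tools.registry import ToolRegistry
    from Agent.tools.screen import AnswerTextTool, AnswerJsonTool, AssertScreenTool

    # Populate the (singleton) registry with the three screen tools from patch 2.
    registry = ToolRegistry()
    for tool_cls in (AnswerTextTool, AnswerJsonTool, AssertScreenTool):
        registry.register(tool_cls())

    # Deterministic selection from patch 3: one tool per query type / response format.
    ask_tool = registry.get_tool_for_query("ask", response_format="json")  # answer_question_json
    check_tool = registry.get_tool_for_query("visual_check")               # assert_screen

    # Screen tools ignore the executor; assert_screen raises AssertionError when the
    # verification fails or the confidence falls below context["min_confidence"].
    check_tool.execute(
        executor=None,
        arguments={
            "verification_result": True,
            "confidence_score": 0.92,
            "analysis": "Home screen with the search bar is visible.",
        },
        context={"min_confidence": 0.7},
    )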