102 changes: 41 additions & 61 deletions Agent/agent_engine.py
@@ -8,7 +8,9 @@
 from Agent.tools.base import ToolCategory
 from Agent.core.keyword_runner import KeywordRunner
 from Agent.tools.mobile import MOBILE_TOOLS
-from Agent.tools.visual import VISUAL_TOOLS
+from Agent.tools.screen.answer_text import AnswerTextTool
+from Agent.tools.screen.answer_json import AnswerJsonTool
+from Agent.tools.screen.assert_screen import AssertScreenTool
 from robot.api import logger

@@ -38,13 +40,16 @@ def __init__(
         self.executor = KeywordRunner(self.platform)
 
         self._register_mobile_tools()
-        self._register_visual_tools()
 
         self.prompt_composer = AgentPromptComposer(
             tool_registry=self.tool_registry,
             platform_connector=self.platform
         )
 
+        self.tool_registry.register(AnswerTextTool())
+        self.tool_registry.register(AnswerJsonTool())
+        self.tool_registry.register(AssertScreenTool())
+
         self.element_source = element_source
         self.llm_input_format = llm_input_format
         logger.info(f"🎯 Element source: {element_source}, LLM input format: {llm_input_format}")
@@ -55,12 +60,6 @@ def _register_mobile_tools(self) -> None:
         mobile_tools_count = len(self.tool_registry.get_by_category(ToolCategory.MOBILE))
         logger.debug(f"📱 Registered {mobile_tools_count} mobile tools")
 
-    def _register_visual_tools(self) -> None:
-        for ToolClass in VISUAL_TOOLS:
-            self.tool_registry.register(ToolClass())
-        visual_tools_count = len(self.tool_registry.get_by_category(ToolCategory.VISUAL))
-        logger.debug(f"👁️ Registered {visual_tools_count} visual tools")
-
     # ----------------------- Public API -----------------------
 
     def set_element_source(self, source: str) -> None:
@@ -203,44 +202,44 @@ def do(self, instruction: str) -> None:
         self._execute_do_from_tool_calls(result, context, instruction)
         logger.info("Agent.Do completed")

-    def visual_check(self, instruction: str) -> None:
+    def visual_check(self, instruction: str, min_confidence: float = 0.7) -> None:
         """Execute visual verification based on natural language instruction.
 
         Args:
             instruction: Natural language verification instruction
                 (e.g., "verify the home screen is displayed")
+            min_confidence: Minimum confidence score required (0.0-1.0, default 0.7)
         """
-        logger.info(f"👁️ Starting Agent.VisualCheck: '{instruction}'")
+        logger.info(f"👁️ Starting Agent.VisualCheck: '{instruction}' (min_confidence={min_confidence})")
 
         if hasattr(self.platform, 'wait_for_page_stable'):
             self.platform.wait_for_page_stable()
 
         screenshot_base64 = self.platform.get_screenshot_base64()
 
         # Embed screenshot to Robot Framework log
         self.platform.embed_image_to_log(screenshot_base64)
         logger.debug("Screenshot captured and sent to AI for analysis")
 
         image_url = self.image_uploader.upload_from_base64(screenshot_base64)
 
-        # Prepare AI request
-        messages = self.prompt_composer.compose_visual_check_messages(instruction, image_url)
-        tools = self.prompt_composer.get_visual_check_tools()
-        logger.debug(f"Visual check tools: {len(tools)} tools")
-
-        if not tools:
-            raise RuntimeError("No visual tools registered. Check tool registration.")
+        tool = self.tool_registry.get_tool_for_query("visual_check")
+        if not tool:
+            raise AssertionError("visual_check tool not found")
+
+        messages = self.prompt_composer.compose_visual_check_messages(instruction, image_url)
 
         # Call AI
         result = self.llm.send_ai_request_with_tools(
             messages=messages,
-            tools=tools,
+            tools=[tool.to_tool_spec()],
             tool_choice="required",
             temperature=0
         )
 
-        logger.debug("Executing visual verification...")
-        self._execute_visual_check_from_tool_calls(result)
-        logger.debug("Agent.VisualCheck completed successfully")
+        tool_call = result.get("tool_calls", [{}])[0]
+        arguments = tool_call.get("function", {}).get("arguments", {})
+
+        context = {"min_confidence": min_confidence}
+        tool.execute(self.executor, arguments, context)
+
+        logger.info("Agent.VisualCheck completed")

     def ask(self, question: str, response_format: str = "text") -> str:
         """Ask AI a question about the current screen.
@@ -252,27 +251,33 @@ def ask(self, question: str, response_format: str = "text") -> str:
         Returns:
             AI response as string (or JSON string if format=json)
         """
-        import json
-        logger.info(f"❓ Agent.Ask: '{question}'")
+        logger.info(f"❓ Starting Agent.Ask: '{question}' (format: {response_format})")
 
         if hasattr(self.platform, 'wait_for_page_stable'):
             self.platform.wait_for_page_stable()
 
         screenshot_base64 = self.platform.get_screenshot_base64()
         self.platform.embed_image_to_log(screenshot_base64)
 
-        messages = self.prompt_composer.compose_ask_messages(
-            question, screenshot_base64, response_format
-        )
+        tool = self.tool_registry.get_tool_for_query("ask", response_format=response_format)
+        if not tool:
+            raise AssertionError(f"No tool found for response_format: {response_format}")
+
+        messages = self.prompt_composer.compose_ask_messages(question, screenshot_base64, response_format)
+
+        result = self.llm.send_ai_request_with_tools(
+            messages=messages,
+            tools=[tool.to_tool_spec()],
+            tool_choice="required",
+            temperature=0
+        )
 
-        if response_format == "json":
-            response_dict = self.llm.send_ai_request_and_return_response(messages=messages, temperature=0)
-            response = json.dumps(response_dict, ensure_ascii=False)
-        else:
-            response = self.llm.send_ai_request(messages=messages, temperature=0)
+        tool_call = result.get("tool_calls", [{}])[0]
+        arguments = tool_call.get("function", {}).get("arguments", {})
 
-        logger.info(f"💬 Response: {response[:100]}..." if len(response) > 100 else f"💬 Response: {response}")
-        return response
+        answer = tool.execute(self.executor, arguments, {})
+        logger.info("Agent.Ask completed")
+        return answer

     def find_visual_element(self, description: str, format: str = "center") -> Dict[str, Any]:
         """Find element visually using OmniParser and return bbox.
@@ -358,29 +363,4 @@ def _execute_do_from_tool_calls(
         # Execute the tool
         tool.execute(self.executor, arguments, context)
 
-    def _execute_visual_check_from_tool_calls(self, result: Dict[str, Any]) -> None:
-        """Execute visual check from tool calls returned by the LLM using the tool registry."""
-        tool_calls = result.get("tool_calls", [])
-
-        if not tool_calls:
-            logger.error("No tool calls in visual check response")
-            raise AssertionError("AI did not return any tool calls for visual verification")
-
-        # Extract the first tool call (typically verify_visual_match)
-        tool_call = tool_calls[0]
-        function_name = tool_call["function"]["name"]
-        arguments = tool_call["function"]["arguments"]
-
-        logger.debug(f"⚙️ Executing visual tool: {function_name}")
-
-        # Get tool from registry
-        tool = self.tool_registry.get(function_name)
-        if not tool:
-            raise AssertionError(f"Unknown visual tool: {function_name}")
-
-        # Prepare context for tool execution (visual tools don't need ui_candidates)
-        context = {}
-
-        # Execute the visual tool (will handle logging and assertions)
-        tool.execute(self.executor, arguments, context)
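Reviewer sketch: visual_check and ask now share one single-tool calling shape: look up exactly one tool, send only its spec with tool_choice="required", then execute the first returned call. Below is a minimal distillation of that flow, assuming get_tool_for_query, to_tool_spec and send_ai_request_with_tools behave as used above; the helper name run_single_tool is invented for illustration.

def run_single_tool(llm, registry, executor, messages, query, context):
    # Resolve exactly one tool for this query, as visual_check/ask now do.
    tool = registry.get_tool_for_query(query)
    if not tool:
        raise AssertionError(f"{query} tool not found")

    # Expose only that tool's spec and force the model to call it.
    result = llm.send_ai_request_with_tools(
        messages=messages,
        tools=[tool.to_tool_spec()],
        tool_choice="required",
        temperature=0
    )

    # Execute the first (and only expected) tool call.
    tool_call = result.get("tool_calls", [{}])[0]
    arguments = tool_call.get("function", {}).get("arguments", {})
    return tool.execute(executor, arguments, context)

One design note: with tool_choice="required" the tool_calls list should be non-empty, which is presumably why the stricter validation in the deleted _execute_visual_check_from_tool_calls was dropped.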

30 changes: 11 additions & 19 deletions Agent/ai/_promptcomposer.py
@@ -202,7 +202,7 @@ def compose_visual_check_messages(
         system_content = (
             "You are a mobile app visual verification engine. "
             "Analyze the screenshot and verify if it matches the instruction. "
-            "Use the verify_visual_match function to report your findings."
+            "Use the assert_screen function to report your findings."
         )
         user_content = [
             {"type": "text", "text": f"Verify: {instruction}"},
@@ -214,32 +214,24 @@
{"role": "user", "content": user_content}
]

-    def get_visual_check_tools(self) -> List[Dict[str, Any]]:
-        """Return tool definitions for visual check actions from the registry.
-
-        Returns tool specs in standard format (works with OpenAI, Anthropic, Gemini, etc.)
-        """
-        return self.registry.get_tool_specs(category=ToolCategory.VISUAL)
-
     def compose_ask_messages(
         self,
         question: str,
         screenshot_base64: str,
-        response_format: str = "text",
+        response_format: str = "text"
     ) -> List[Dict[str, Any]]:
-        """Build messages for asking AI about current screen."""
+        """Build messages for asking AI about current screen using tool calling."""
         if response_format == "json":
-            system_content = (
-                "You are a screen analysis assistant. "
-                "Answer questions about what you see in the screenshot. "
-                "IMPORTANT: Always respond with valid JSON only, no markdown, no explanation outside JSON."
-            )
+            instruction = "Use the answer_question_json function to provide your answer as a JSON object."
         else:
-            system_content = (
-                "You are a screen analysis assistant. "
-                "Answer questions about what you see in the screenshot. "
-                "Be concise and direct."
-            )
+            instruction = "Use the answer_question function to provide your answer as text."
+
+        system_content = (
+            "You are a screen analysis assistant. "
+            "Answer questions about what you see in the screenshot. "
+            f"{instruction}"
+        )

         user_content = [
             {"type": "text", "text": question},
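The tail of compose_ask_messages is folded out of this hunk; judging from compose_visual_check_messages above, the returned payload presumably has the same two-message shape (system string plus multimodal user content). A hypothetical sketch, where the base64 data-URL encoding is an assumption, not confirmed by this diff:

        return [
            {"role": "system", "content": system_content},
            {"role": "user", "content": [
                {"type": "text", "text": question},
                {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{screenshot_base64}"}}
            ]}
        ]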
4 changes: 2 additions & 2 deletions Agent/tools/base.py
@@ -7,7 +7,7 @@
 class ToolCategory(Enum):
     MOBILE = "mobile"
     WEB = "web"
-    VISUAL = "visual"
+    SCREEN = "screen"


 class BaseTool(ABC):
@@ -35,7 +35,7 @@ def description(self) -> str:
     @property
     @abstractmethod
     def category(self) -> ToolCategory:
-        """Tool category: ToolCategory.MOBILE, ToolCategory.WEB, or ToolCategory.VISUAL."""
+        """Tool category: ToolCategory.MOBILE, ToolCategory.WEB, or ToolCategory.SCREEN."""
         pass
 
     @property
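To make the renamed SCREEN category concrete, here is a minimal sketch of a tool against this BaseTool interface, roughly the shape the AnswerTextTool registered in agent_engine.py presumably has. The class name and body are illustrative, not the PR's actual implementation; the works_on_coordinates signature is inferred from the sibling mobile tools in this diff.

from typing import Any, Dict

from Agent.tools.base import BaseTool, ToolCategory

class EchoAnswerTool(BaseTool):
    """Hypothetical SCREEN tool: surfaces the model's answer text verbatim."""

    @property
    def name(self) -> str:
        return "answer_question"

    @property
    def description(self) -> str:
        return "Provide the final text answer to the user's question."

    @property
    def category(self) -> ToolCategory:
        return ToolCategory.SCREEN

    @property
    def works_on_coordinates(self) -> bool:
        # Screen tools answer about the screenshot; no tapping involved.
        return False

    def get_parameters_schema(self) -> Dict[str, Any]:
        return {
            "type": "object",
            "properties": {
                "answer": {"type": "string", "description": "The answer text"}
            },
            "required": ["answer"]
        }

    def execute(self, executor, arguments: Dict[str, Any], context: Dict[str, Any]) -> str:
        # No keyword execution needed; just return the model's answer.
        return arguments.get("answer", "")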
10 changes: 8 additions & 2 deletions Agent/tools/mobile/click_element.py
@@ -15,11 +15,11 @@ class ClickElementTool(BaseTool):

     @property
     def name(self) -> str:
-        return "tap_element"
+        return "click_element"
 
     @property
     def description(self) -> str:
-        return "Tap/click element by INDEX. DO NOT use for text input - use input_text instead."
+        return "CLICK/TAP on visible elements. PREFER elements that contain/enclose CLEAR TEXT/LABELS over icons when possible. Choose the most explicit element (e.g., text suggestions, labeled buttons) rather than ambiguous icons."

@property
def category(self) -> ToolCategory:
@@ -45,6 +45,10 @@ def get_parameters_schema(self) -> Dict[str, Any]:
"type": "integer",
"description": "The index number of the element from the UI elements list (1-based)",
"minimum": 1
},
"reasoning": {
"type": "string",
"description": "Brief explanation (1 sentence) of WHY you chose this element and action"
}
},
"required": ["element_index"]
@@ -57,6 +61,7 @@ def execute(
         context: Dict[str, Any]
     ) -> None:
         element_index = arguments["element_index"]
+        reasoning = arguments.get("reasoning", "No reasoning provided")
         ui_candidates = context.get("ui_candidates", [])
 
         if element_index < 1 or element_index > len(ui_candidates):
@@ -67,6 +72,7 @@
         element = ui_candidates[element_index - 1]
         x, y = get_element_center(element)
 
+        logger.info(f"🧠 AI reasoning: {reasoning}")
         logger.debug(f"Tapping at ({x}, {y}) for element: {element.get('text', '')}")
         executor.run_keyword("Tap", [x, y])
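With the new reasoning parameter, the arguments on a click_element tool call would look like the sketch below (values invented for illustration); execute logs the reasoning before tapping:

# Hypothetical payload from result["tool_calls"][0]["function"]["arguments"]:
{
    "element_index": 3,
    "reasoning": "Element 3 is the labeled 'Login' button, clearer than the nearby icon."
}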

9 changes: 8 additions & 1 deletion Agent/tools/mobile/go_back.py
@@ -32,7 +32,12 @@ def works_on_coordinates(self) -> bool:
     def get_parameters_schema(self) -> Dict[str, Any]:
         return {
             "type": "object",
-            "properties": {},
+            "properties": {
+                "reasoning": {
+                    "type": "string",
+                    "description": "Brief explanation (1 sentence) of WHY you chose this action"
+                }
+            },
             "required": []
         }

@@ -42,5 +47,7 @@ def execute(
         arguments: Dict[str, Any],
         context: Dict[str, Any]
     ) -> None:
+        reasoning = arguments.get("reasoning", "No reasoning provided")
+        logger.info(f"🧠 AI reasoning: {reasoning}")
         executor.run_keyword("Go Back")

9 changes: 8 additions & 1 deletion Agent/tools/mobile/hide_keyboard.py
@@ -33,7 +33,12 @@ def works_on_coordinates(self) -> bool:
     def get_parameters_schema(self) -> Dict[str, Any]:
         return {
             "type": "object",
-            "properties": {},
+            "properties": {
+                "reasoning": {
+                    "type": "string",
+                    "description": "Brief explanation (1 sentence) of WHY you chose this action"
+                }
+            },
             "required": []
         }

@@ -43,6 +48,8 @@ def execute(
         arguments: Dict[str, Any],
         context: Dict[str, Any]
     ) -> None:
+        reasoning = arguments.get("reasoning", "No reasoning provided")
+        logger.info(f"🧠 AI reasoning: {reasoning}")
         # Hide Keyboard without arguments for iOS/Android compatibility
         executor.run_keyword("Hide Keyboard")

20 changes: 13 additions & 7 deletions Agent/tools/mobile/input_text.py
@@ -13,7 +13,7 @@ def name(self) -> str:

     @property
     def description(self) -> str:
-        return "USE THIS when instruction contains 'input', 'type', 'enter', 'fill', 'write', 'saisir', 'taper' or mentions entering text. Types text into a text field."
+        return "USE THIS ONLY when instruction explicitly mentions entering TEXT: 'input', 'type', 'enter', 'fill', 'write', 'saisir', 'taper'. Types text into a text field. DO NOT use this tool to click or tap - use click_element for that."

@property
def category(self) -> ToolCategory:
@@ -39,6 +39,10 @@ def get_parameters_schema(self) -> Dict[str, Any]:
"text": {
"type": "string",
"description": "The text to input into the element"
},
"reasoning": {
"type": "string",
"description": "Brief explanation (1 sentence) of WHY you chose this element and action"
}
},
"required": ["element_index", "text"]
@@ -50,21 +54,23 @@ def execute(
         arguments: Dict[str, Any],
         context: Dict[str, Any]
     ) -> None:
-        element_index = arguments["element_index"]
-        text = arguments["text"]
+        element_index = arguments.get("element_index")
+        text = arguments.get("text")
+        reasoning = arguments.get("reasoning", "No reasoning provided")
         ui_candidates = context.get("ui_candidates", [])
 
-        if element_index < 1 or element_index > len(ui_candidates):
+        if not text:
+            raise AssertionError("'input_text' requires text argument. Use click_element to click without entering text.")
+
+        if element_index is None or element_index < 1 or element_index > len(ui_candidates):
             raise AssertionError(
                 f"Invalid element_index: {element_index}. Must be 1-{len(ui_candidates)}"
             )
 
-        if not text:
-            raise AssertionError("'input_text' requires text argument")
-
         element = ui_candidates[element_index - 1]
         x, y = get_element_center(element)
 
+        logger.info(f"🧠 AI reasoning: {reasoning}")
         logger.debug(f"Tapping at ({x}, {y}) to focus, then input: '{text}'")
         executor.run_keyword("Tap", [x, y])
         executor.run_keyword("Sleep", "1s")
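The reordered guards change which error a misbehaving model sees: a call carrying an element index but no text now hits the text check first and gets the corrective hint, instead of passing the index check and failing with the old terse message. Sketch with invented values:

# Hypothetical misuse: the model tries to "click" via input_text.
arguments = {"element_index": 2, "reasoning": "Tap the search field"}
# Old order: index check passed, then failed with the bare
# "'input_text' requires text argument".
# New order: fails immediately with the message steering the model
# toward click_element, before element_index is even compared.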