102 changes: 41 additions & 61 deletions Agent/agent_engine.py
@@ -8,7 +8,9 @@
 from Agent.tools.base import ToolCategory
 from Agent.core.keyword_runner import KeywordRunner
 from Agent.tools.mobile import MOBILE_TOOLS
-from Agent.tools.visual import VISUAL_TOOLS
+from Agent.tools.screen.answer_text import AnswerTextTool
+from Agent.tools.screen.answer_json import AnswerJsonTool
+from Agent.tools.screen.assert_screen import AssertScreenTool
 from robot.api import logger

@@ -38,13 +40,16 @@ def __init__(
         self.executor = KeywordRunner(self.platform)
 
         self._register_mobile_tools()
-        self._register_visual_tools()
 
         self.prompt_composer = AgentPromptComposer(
             tool_registry=self.tool_registry,
             platform_connector=self.platform
         )
 
+        self.tool_registry.register(AnswerTextTool())
+        self.tool_registry.register(AnswerJsonTool())
+        self.tool_registry.register(AssertScreenTool())
+
         self.element_source = element_source
         self.llm_input_format = llm_input_format
         logger.info(f"🎯 Element source: {element_source}, LLM input format: {llm_input_format}")
@@ -55,12 +60,6 @@ def _register_mobile_tools(self) -> None:
         mobile_tools_count = len(self.tool_registry.get_by_category(ToolCategory.MOBILE))
         logger.debug(f"📱 Registered {mobile_tools_count} mobile tools")
 
-    def _register_visual_tools(self) -> None:
-        for ToolClass in VISUAL_TOOLS:
-            self.tool_registry.register(ToolClass())
-        visual_tools_count = len(self.tool_registry.get_by_category(ToolCategory.VISUAL))
-        logger.debug(f"👁️ Registered {visual_tools_count} visual tools")
-
     # ----------------------- Public API -----------------------
 
     def set_element_source(self, source: str) -> None:
@@ -203,44 +202,44 @@ def do(self, instruction: str) -> None:
         self._execute_do_from_tool_calls(result, context, instruction)
         logger.info("Agent.Do completed")

-    def visual_check(self, instruction: str) -> None:
+    def visual_check(self, instruction: str, min_confidence: float = 0.7) -> None:
         """Execute visual verification based on natural language instruction.
 
         Args:
             instruction: Natural language verification instruction
                 (e.g., "verify the home screen is displayed")
+            min_confidence: Minimum confidence score required (0.0-1.0, default 0.7)
         """
-        logger.info(f"👁️ Starting Agent.VisualCheck: '{instruction}'")
+        logger.info(f"👁️ Starting Agent.VisualCheck: '{instruction}' (min_confidence={min_confidence})")
 
         if hasattr(self.platform, 'wait_for_page_stable'):
             self.platform.wait_for_page_stable()
 
         screenshot_base64 = self.platform.get_screenshot_base64()
 
         # Embed screenshot to Robot Framework log
         self.platform.embed_image_to_log(screenshot_base64)
         logger.debug("Screenshot captured and sent to AI for analysis")
 
         image_url = self.image_uploader.upload_from_base64(screenshot_base64)
 
-        # Prepare AI request
-        messages = self.prompt_composer.compose_visual_check_messages(instruction, image_url)
-        tools = self.prompt_composer.get_visual_check_tools()
-        logger.debug(f"Visual check tools: {len(tools)} tools")
-
-        if not tools:
-            raise RuntimeError("No visual tools registered. Check tool registration.")
+        tool = self.tool_registry.get_tool_for_query("visual_check")
+        if not tool:
+            raise AssertionError("visual_check tool not found")
+
+        messages = self.prompt_composer.compose_visual_check_messages(instruction, image_url)
 
         # Call AI
         result = self.llm.send_ai_request_with_tools(
             messages=messages,
-            tools=tools,
+            tools=[tool.to_tool_spec()],
             tool_choice="required",
             temperature=0
         )
 
-        logger.debug("Executing visual verification...")
-        self._execute_visual_check_from_tool_calls(result)
-        logger.debug("Agent.VisualCheck completed successfully")
+        tool_call = result.get("tool_calls", [{}])[0]
+        arguments = tool_call.get("function", {}).get("arguments", {})
+
+        context = {"min_confidence": min_confidence}
+        tool.execute(self.executor, arguments, context)
+
+        logger.info("Agent.VisualCheck completed")

     def ask(self, question: str, response_format: str = "text") -> str:
         """Ask AI a question about the current screen.
@@ -252,27 +251,33 @@ def ask(self, question: str, response_format: str = "text") -> str:
         Returns:
             AI response as string (or JSON string if format=json)
         """
-        import json
-        logger.info(f"❓ Agent.Ask: '{question}'")
+        logger.info(f"❓ Starting Agent.Ask: '{question}' (format: {response_format})")
 
         if hasattr(self.platform, 'wait_for_page_stable'):
             self.platform.wait_for_page_stable()
 
         screenshot_base64 = self.platform.get_screenshot_base64()
         self.platform.embed_image_to_log(screenshot_base64)
 
-        messages = self.prompt_composer.compose_ask_messages(
-            question, screenshot_base64, response_format
-        )
+        tool = self.tool_registry.get_tool_for_query("ask", response_format=response_format)
+        if not tool:
+            raise AssertionError(f"No tool found for response_format: {response_format}")
+
+        messages = self.prompt_composer.compose_ask_messages(question, screenshot_base64, response_format)
+
+        result = self.llm.send_ai_request_with_tools(
+            messages=messages,
+            tools=[tool.to_tool_spec()],
+            tool_choice="required",
+            temperature=0
+        )
 
-        if response_format == "json":
-            response_dict = self.llm.send_ai_request_and_return_response(messages=messages, temperature=0)
-            response = json.dumps(response_dict, ensure_ascii=False)
-        else:
-            response = self.llm.send_ai_request(messages=messages, temperature=0)
+        tool_call = result.get("tool_calls", [{}])[0]
+        arguments = tool_call.get("function", {}).get("arguments", {})
 
-        logger.info(f"💬 Response: {response[:100]}..." if len(response) > 100 else f"💬 Response: {response}")
-        return response
+        answer = tool.execute(self.executor, arguments, {})
+        logger.info("Agent.Ask completed")
+        return answer

     def find_visual_element(self, description: str, format: str = "center") -> Dict[str, Any]:
         """Find element visually using OmniParser and return bbox.
@@ -358,29 +363,4 @@ def _execute_do_from_tool_calls(
         # Execute the tool
         tool.execute(self.executor, arguments, context)
 
-    def _execute_visual_check_from_tool_calls(self, result: Dict[str, Any]) -> None:
-        """Execute visual check from tool calls returned by the LLM using the tool registry."""
-        tool_calls = result.get("tool_calls", [])
-
-        if not tool_calls:
-            logger.error("No tool calls in visual check response")
-            raise AssertionError("AI did not return any tool calls for visual verification")
-
-        # Extract the first tool call (typically verify_visual_match)
-        tool_call = tool_calls[0]
-        function_name = tool_call["function"]["name"]
-        arguments = tool_call["function"]["arguments"]
-
-        logger.debug(f"⚙️ Executing visual tool: {function_name}")
-
-        # Get tool from registry
-        tool = self.tool_registry.get(function_name)
-        if not tool:
-            raise AssertionError(f"Unknown visual tool: {function_name}")
-
-        # Prepare context for tool execution (visual tools don't need ui_candidates)
-        context = {}
-
-        # Execute the visual tool (will handle logging and assertions)
-        tool.execute(self.executor, arguments, context)
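Reviewer sketch: visual_check and ask now share one single-tool calling shape: look up exactly one tool, send only its spec with tool_choice="required", then execute the first returned call. Below is a minimal distillation of that flow, assuming get_tool_for_query, to_tool_spec and send_ai_request_with_tools behave as used above; the helper name run_single_tool is invented for illustration.

def run_single_tool(llm, registry, executor, messages, query, context):
    # Resolve exactly one tool for this query, as visual_check/ask now do.
    tool = registry.get_tool_for_query(query)
    if not tool:
        raise AssertionError(f"{query} tool not found")

    # Expose only that tool's spec and force the model to call it.
    result = llm.send_ai_request_with_tools(
        messages=messages,
        tools=[tool.to_tool_spec()],
        tool_choice="required",
        temperature=0
    )

    # Execute the first (and only expected) tool call.
    tool_call = result.get("tool_calls", [{}])[0]
    arguments = tool_call.get("function", {}).get("arguments", {})
    return tool.execute(executor, arguments, context)

One design note: with tool_choice="required" the tool_calls list should be non-empty, which is presumably why the stricter validation in the deleted _execute_visual_check_from_tool_calls was dropped.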

30 changes: 11 additions & 19 deletions Agent/ai/_promptcomposer.py
@@ -202,7 +202,7 @@ def compose_visual_check_messages(
         system_content = (
             "You are a mobile app visual verification engine. "
             "Analyze the screenshot and verify if it matches the instruction. "
-            "Use the verify_visual_match function to report your findings."
+            "Use the assert_screen function to report your findings."
         )
         user_content = [
             {"type": "text", "text": f"Verify: {instruction}"},
@@ -214,32 +214,24 @@
{"role": "user", "content": user_content}
]

-    def get_visual_check_tools(self) -> List[Dict[str, Any]]:
-        """Return tool definitions for visual check actions from the registry.
-
-        Returns tool specs in standard format (works with OpenAI, Anthropic, Gemini, etc.)
-        """
-        return self.registry.get_tool_specs(category=ToolCategory.VISUAL)
-
     def compose_ask_messages(
         self,
         question: str,
         screenshot_base64: str,
-        response_format: str = "text",
+        response_format: str = "text"
     ) -> List[Dict[str, Any]]:
-        """Build messages for asking AI about current screen."""
+        """Build messages for asking AI about current screen using tool calling."""
         if response_format == "json":
-            system_content = (
-                "You are a screen analysis assistant. "
-                "Answer questions about what you see in the screenshot. "
-                "IMPORTANT: Always respond with valid JSON only, no markdown, no explanation outside JSON."
-            )
+            instruction = "Use the answer_question_json function to provide your answer as a JSON object."
         else:
-            system_content = (
-                "You are a screen analysis assistant. "
-                "Answer questions about what you see in the screenshot. "
-                "Be concise and direct."
-            )
+            instruction = "Use the answer_question function to provide your answer as text."
+
+        system_content = (
+            "You are a screen analysis assistant. "
+            "Answer questions about what you see in the screenshot. "
+            f"{instruction}"
+        )

         user_content = [
             {"type": "text", "text": question},
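The tail of compose_ask_messages is folded out of this hunk; judging from compose_visual_check_messages above, the returned payload presumably has the same two-message shape (system string plus multimodal user content). A hypothetical sketch, where the base64 data-URL encoding is an assumption, not confirmed by this diff:

        return [
            {"role": "system", "content": system_content},
            {"role": "user", "content": [
                {"type": "text", "text": question},
                {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{screenshot_base64}"}}
            ]}
        ]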
4 changes: 2 additions & 2 deletions Agent/tools/base.py
@@ -7,7 +7,7 @@
 class ToolCategory(Enum):
     MOBILE = "mobile"
     WEB = "web"
-    VISUAL = "visual"
+    SCREEN = "screen"


 class BaseTool(ABC):
@@ -35,7 +35,7 @@ def description(self) -> str:
     @property
     @abstractmethod
     def category(self) -> ToolCategory:
-        """Tool category: ToolCategory.MOBILE, ToolCategory.WEB, or ToolCategory.VISUAL."""
+        """Tool category: ToolCategory.MOBILE, ToolCategory.WEB, or ToolCategory.SCREEN."""
         pass
 
     @property
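To make the renamed SCREEN category concrete, here is a minimal sketch of a tool against this BaseTool interface, roughly the shape the AnswerTextTool registered in agent_engine.py presumably has. The class name and body are illustrative, not the PR's actual implementation; the works_on_coordinates signature is inferred from the sibling mobile tools in this diff.

from typing import Any, Dict

from Agent.tools.base import BaseTool, ToolCategory

class EchoAnswerTool(BaseTool):
    """Hypothetical SCREEN tool: surfaces the model's answer text verbatim."""

    @property
    def name(self) -> str:
        return "answer_question"

    @property
    def description(self) -> str:
        return "Provide the final text answer to the user's question."

    @property
    def category(self) -> ToolCategory:
        return ToolCategory.SCREEN

    @property
    def works_on_coordinates(self) -> bool:
        # Screen tools answer about the screenshot; no tapping involved.
        return False

    def get_parameters_schema(self) -> Dict[str, Any]:
        return {
            "type": "object",
            "properties": {
                "answer": {"type": "string", "description": "The answer text"}
            },
            "required": ["answer"]
        }

    def execute(self, executor, arguments: Dict[str, Any], context: Dict[str, Any]) -> str:
        # No keyword execution needed; just return the model's answer.
        return arguments.get("answer", "")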
10 changes: 8 additions & 2 deletions Agent/tools/mobile/click_element.py
@@ -15,11 +15,11 @@ class ClickElementTool(BaseTool):

     @property
     def name(self) -> str:
-        return "tap_element"
+        return "click_element"
 
     @property
     def description(self) -> str:
-        return "Tap/click element by INDEX. DO NOT use for text input - use input_text instead."
+        return "CLICK/TAP on visible elements. PREFER elements that contain/enclose CLEAR TEXT/LABELS over icons when possible. Choose the most explicit element (e.g., text suggestions, labeled buttons) rather than ambiguous icons."

@property
def category(self) -> ToolCategory:
@@ -45,6 +45,10 @@ def get_parameters_schema(self) -> Dict[str, Any]:
"type": "integer",
"description": "The index number of the element from the UI elements list (1-based)",
"minimum": 1
},
"reasoning": {
"type": "string",
"description": "Brief explanation (1 sentence) of WHY you chose this element and action"
}
},
"required": ["element_index"]
@@ -57,6 +61,7 @@ def execute(
         context: Dict[str, Any]
     ) -> None:
         element_index = arguments["element_index"]
+        reasoning = arguments.get("reasoning", "No reasoning provided")
         ui_candidates = context.get("ui_candidates", [])
 
         if element_index < 1 or element_index > len(ui_candidates):
@@ -67,6 +72,7 @@
         element = ui_candidates[element_index - 1]
         x, y = get_element_center(element)
 
+        logger.info(f"🧠 AI reasoning: {reasoning}")
         logger.debug(f"Tapping at ({x}, {y}) for element: {element.get('text', '')}")
         executor.run_keyword("Tap", [x, y])
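With the new reasoning parameter, the arguments on a click_element tool call would look like the sketch below (values invented for illustration); execute logs the reasoning before tapping:

# Hypothetical payload from result["tool_calls"][0]["function"]["arguments"]:
{
    "element_index": 3,
    "reasoning": "Element 3 is the labeled 'Login' button, clearer than the nearby icon."
}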

9 changes: 8 additions & 1 deletion Agent/tools/mobile/go_back.py
@@ -32,7 +32,12 @@ def works_on_coordinates(self) -> bool:
     def get_parameters_schema(self) -> Dict[str, Any]:
         return {
             "type": "object",
-            "properties": {},
+            "properties": {
+                "reasoning": {
+                    "type": "string",
+                    "description": "Brief explanation (1 sentence) of WHY you chose this action"
+                }
+            },
             "required": []
         }

@@ -42,5 +47,7 @@ def execute(
         arguments: Dict[str, Any],
         context: Dict[str, Any]
     ) -> None:
+        reasoning = arguments.get("reasoning", "No reasoning provided")
+        logger.info(f"🧠 AI reasoning: {reasoning}")
         executor.run_keyword("Go Back")

9 changes: 8 additions & 1 deletion Agent/tools/mobile/hide_keyboard.py
@@ -33,7 +33,12 @@ def works_on_coordinates(self) -> bool:
     def get_parameters_schema(self) -> Dict[str, Any]:
         return {
             "type": "object",
-            "properties": {},
+            "properties": {
+                "reasoning": {
+                    "type": "string",
+                    "description": "Brief explanation (1 sentence) of WHY you chose this action"
+                }
+            },
             "required": []
         }

@@ -43,6 +48,8 @@ def execute(
         arguments: Dict[str, Any],
         context: Dict[str, Any]
     ) -> None:
+        reasoning = arguments.get("reasoning", "No reasoning provided")
+        logger.info(f"🧠 AI reasoning: {reasoning}")
         # Hide Keyboard without arguments for iOS/Android compatibility
         executor.run_keyword("Hide Keyboard")

20 changes: 13 additions & 7 deletions Agent/tools/mobile/input_text.py
@@ -13,7 +13,7 @@ def name(self) -> str:

     @property
     def description(self) -> str:
-        return "USE THIS when instruction contains 'input', 'type', 'enter', 'fill', 'write', 'saisir', 'taper' or mentions entering text. Types text into a text field."
+        return "USE THIS ONLY when instruction explicitly mentions entering TEXT: 'input', 'type', 'enter', 'fill', 'write', 'saisir', 'taper'. Types text into a text field. DO NOT use this tool to click or tap - use click_element for that."

@property
def category(self) -> ToolCategory:
@@ -39,6 +39,10 @@ def get_parameters_schema(self) -> Dict[str, Any]:
"text": {
"type": "string",
"description": "The text to input into the element"
},
"reasoning": {
"type": "string",
"description": "Brief explanation (1 sentence) of WHY you chose this element and action"
}
},
"required": ["element_index", "text"]
@@ -50,21 +54,23 @@ def execute(
         arguments: Dict[str, Any],
         context: Dict[str, Any]
     ) -> None:
-        element_index = arguments["element_index"]
-        text = arguments["text"]
+        element_index = arguments.get("element_index")
+        text = arguments.get("text")
+        reasoning = arguments.get("reasoning", "No reasoning provided")
         ui_candidates = context.get("ui_candidates", [])
 
-        if element_index < 1 or element_index > len(ui_candidates):
+        if not text:
+            raise AssertionError("'input_text' requires text argument. Use click_element to click without entering text.")
+
+        if element_index is None or element_index < 1 or element_index > len(ui_candidates):
             raise AssertionError(
                 f"Invalid element_index: {element_index}. Must be 1-{len(ui_candidates)}"
             )
 
-        if not text:
-            raise AssertionError("'input_text' requires text argument")
-
         element = ui_candidates[element_index - 1]
         x, y = get_element_center(element)
 
+        logger.info(f"🧠 AI reasoning: {reasoning}")
         logger.debug(f"Tapping at ({x}, {y}) to focus, then input: '{text}'")
         executor.run_keyword("Tap", [x, y])
         executor.run_keyword("Sleep", "1s")
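The reordered guards change which error a misbehaving model sees: a call carrying an element index but no text now hits the text check first and gets the corrective hint, instead of passing the index check and failing with the old terse message. Sketch with invented values:

# Hypothetical misuse: the model tries to "click" via input_text.
arguments = {"element_index": 2, "reasoning": "Tap the search field"}
# Old order: index check passed, then failed with the bare
# "'input_text' requires text argument".
# New order: fails immediately with the message steering the model
# toward click_element, before element_index is even compared.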