# File: Agent-S/gui_agents/s3/agents/code_agent.py
# Commit: c2c1e1bba4 by alckasoc ("s3 🧠🤓🤯"), 2025-10-02 18:17:41 -07:00
# Size: 279 lines, 11 KiB, Python

import logging
from typing import Dict, List, Tuple, Optional
from gui_agents.s3.memory.procedural_memory import PROCEDURAL_MEMORY
from gui_agents.s3.utils.common_utils import call_llm_safe, split_thinking_response
from gui_agents.s3.core.mllm import LMMAgent
# Module-level logger. It is registered under the fixed name
# "desktopenv.agent" (not __name__), so every record emitted by this file
# appears under that logger name.
logger = logging.getLogger("desktopenv.agent")
def extract_code_block(action: str) -> Tuple[Optional[str], Optional[str]]:
    """Pull a fenced code block out of an agent action string.

    Fence markers are probed in priority order: a ```python fence wins over a
    ```bash fence, which wins over an untagged ``` fence, regardless of where
    each one appears in the text.

    Args:
        action: Raw action text produced by the model.

    Returns:
        A ``(code_type, code)`` pair. ``code_type`` is ``"python"`` or
        ``"bash"`` for a tagged fence and ``None`` otherwise. ``code`` is the
        whitespace-stripped text between the matched opening fence and the
        next ``` marker (or the end of the text when the fence is never
        closed), or ``None`` when the action contains no fence at all.
    """
    # (opening fence, reported type), highest priority first.
    fence_kinds = (
        ("```python", "python"),
        ("```bash", "bash"),
        ("```", None),
    )

    code_type = None
    code = None
    for fence, kind in fence_kinds:
        if fence in action:
            code_type = kind
            # Text after the first opening fence, cut at the next ``` marker.
            code = action.split(fence)[1].split("```")[0].strip()
            break

    logger.debug(f"Extracted code block: type={code_type}, length={len(code) if code else 0}")
    return code_type, code
def execute_code(code_type: str, code: str, env_controller) -> Dict:
    """Run a snippet through the environment controller.

    Args:
        code_type: ``"bash"`` or ``"python"``; anything else is reported as
            an error without touching the controller.
        code: Source text to run.
        env_controller: Object providing ``run_bash_script(code, timeout=...)``
            and ``run_python_script(code)``.

    Returns:
        Whatever the controller method returned, or an
        ``{"status": "error", "error": ...}`` dict when the type is unknown
        or the controller call raised.
    """
    # The full, untruncated code is logged before it runs.
    logger.info(f"CODING_AGENT_CODE_EXECUTION - Type: {code_type}\nCode:\n{code}")

    try:
        if code_type == "python":
            return env_controller.run_python_script(code)
        if code_type == "bash":
            # Only the bash path passes a timeout to the controller.
            return env_controller.run_bash_script(code, timeout=30)
        return {"status": "error", "error": f"Unknown code type: {code_type}"}
    except Exception as exc:
        # Any controller failure is converted into an error result so the
        # caller can report it back to the model instead of aborting.
        logger.error(f"Error executing {code_type} code: {exc}")
        return {"status": "error", "error": str(exc)}
def format_result(result: Dict, step_count: int) -> str:
    """Render an execution result as the text fed back to the model.

    Args:
        result: Result dict from the environment controller. The keys read
            are ``status``, ``returncode`` (falling back to ``return_code``),
            ``output`` and ``error``; all are optional.
        step_count: Zero-based step index; the text shows ``step_count + 1``.

    Returns:
        A multi-line report. A falsy ``result`` (``None`` or an empty dict)
        yields a fixed "No result returned from execution" error report.
    """
    step_number = step_count + 1

    if not result:
        logger.warning(f"Step {step_number}: No result returned from execution")
        return f"\nStep {step_number} Error:\nError: No result returned from execution\n"

    status = result.get("status", "unknown")
    return_code = result.get("returncode", result.get("return_code", -1))
    # Both response shapes are read through the same two keys. Per the
    # original author's notes, a bash response (the one carrying
    # "returncode") merges stdout and stderr into "output" and leaves "error"
    # empty, while a python response keeps stdout in "output" and stderr in
    # "error".
    output = result.get("output", "")
    error = result.get("error", "")

    logger.debug(f"Step {step_number}: Status={status}, Return Code={return_code}")

    # Each section ends with a newline so multi-line payloads stay readable.
    sections = [
        f"Step {step_number} Result:\n",
        f"Status: {status}\n",
        f"Return Code: {return_code}\n",
    ]
    if output:
        sections.append(f"Output:\n{output}\n")
    if error:
        sections.append(f"Error:\n{error}\n")
    return "".join(sections)
class CodeAgent:
    """A dedicated agent for executing code with a budget of steps.

    Each call to ``execute`` starts a fresh LLM conversation. The model
    proposes a fenced code block, the block is run through the environment
    controller, and the formatted result is fed back as the next user
    message. The loop stops when the model answers ``DONE`` or ``FAIL`` or
    when ``budget`` steps have been used.
    """

    def __init__(self, engine_params: Dict, budget: int = 20):
        """Initialize the CodeAgent.

        Args:
            engine_params: Configuration forwarded to ``LMMAgent``; must be
                non-empty.
            budget: Maximum number of steps a single ``execute`` call may use.

        Raises:
            ValueError: If ``engine_params`` is ``None`` or empty.
        """
        if not engine_params:
            raise ValueError("engine_params cannot be None or empty")

        self.engine_params = engine_params
        self.budget = budget
        self.agent = None

        logger.info(f"CodeAgent initialized with budget={budget}")
        self.reset()

    def reset(self):
        """Reset the code agent state by building a fresh ``LMMAgent``."""
        logger.debug("Resetting CodeAgent state")
        self.agent = LMMAgent(
            engine_params=self.engine_params,
            system_prompt=PROCEDURAL_MEMORY.CODE_AGENT_PROMPT,
        )

    @staticmethod
    def _log_step_result(step_number: int, result: Dict) -> None:
        """Log one step's result in the CODING_AGENT_EXECUTION_RESULT layout."""
        output = result.get("output", "")
        error = result.get("error", "")
        message = result.get("message", "")
        status = result.get("status", "")

        rule = "-" * 40
        alert = "!" * 40

        log_lines = [f"CODING_AGENT_EXECUTION_RESULT - Step {step_number}:"]
        if status:
            log_lines.append(f"Status: {status}")
        if output:
            log_lines.append(f"Output:\n{rule}\n{output}\n{rule}")
        if error:
            log_lines.append(f"Error:\n{alert}\n{error}\n{alert}")
        # The message is only shown when there is nothing more specific.
        if message and not output and not error:
            log_lines.append(f"Message:\n{rule}\n{message}\n{rule}")

        logger.info("\n".join(log_lines))

    def execute(self, task_instruction: str, screenshot: str, env_controller) -> Dict:
        """Execute code for the given task with a budget of steps.

        Args:
            task_instruction: Natural-language description of the task.
            screenshot: Image content attached to the first user message.
            env_controller: Object used by ``execute_code`` to run scripts.

        Returns:
            A dict with the keys ``task_instruction``, ``completion_reason``
            (``"DONE"``, ``"FAIL"`` or ``"BUDGET_EXHAUSTED_AFTER_<n>_STEPS"``),
            ``summary``, ``execution_history``, ``steps_executed`` and
            ``budget``. ``steps_executed`` counts only steps whose result was
            fed back to the model, so the terminating ``DONE``/``FAIL`` reply
            appears in ``execution_history`` but is not counted.

        Raises:
            RuntimeError: If the LLM returns an empty response.
        """
        logger.info(f"Starting code execution for task: {task_instruction}")
        logger.info(f"Budget: {self.budget} steps")

        self.reset()

        # Add initial task instruction and screenshot context as user message.
        context = f"Task: {task_instruction}\n\nCurrent screenshot is provided for context."
        self.agent.add_message(context, image_content=screenshot, role="user")

        step_count = 0
        execution_history = []
        # Stays None unless the model explicitly ends the session; this
        # replaces probing locals() for the name after the loop.
        completion_reason = None

        while step_count < self.budget:
            step_number = step_count + 1
            logger.info(f"Step {step_number}/{self.budget}")

            # Get assistant response (thoughts and code).
            response = call_llm_safe(self.agent, temperature=1)

            # Log the latest message from the coding agent (untruncated).
            logger.info(f"CODING_AGENT_LATEST_MESSAGE - Step {step_number}:\n{response}")

            if not response or response.strip() == "":
                error_msg = f"Step {step_number}: LLM returned empty response"
                logger.error(error_msg)
                raise RuntimeError(error_msg)

            # Parse the response to extract action.
            action, thoughts = split_thinking_response(response)
            # NOTE(review): history entries carry no execution output, yet the
            # summary prompt asks about outputs — confirm whether results
            # should be recorded here as well.
            execution_history.append(
                {
                    "step": step_number,
                    "action": action,
                    "thoughts": thoughts,
                }
            )

            # Check for completion signals.
            action_upper = action.upper().strip()
            if action_upper == "DONE":
                logger.info(f"Step {step_number}: Task completed successfully")
                completion_reason = "DONE"
                break
            if action_upper == "FAIL":
                logger.info(f"Step {step_number}: Task failed by agent request")
                completion_reason = "FAIL"
                break

            # Extract and execute code.
            code_type, code = extract_code_block(action)
            if code:
                # execute_code forwards whatever the controller returned. A
                # None/empty return is normalised to {} so the .get() calls
                # below cannot raise and format_result reports
                # "No result returned from execution" to the model.
                result = execute_code(code_type, code, env_controller) or {}
            else:
                logger.warning(f"Step {step_number}: No code block found in action")
                result = {"status": "skipped", "message": "No code block found"}
            self._log_step_result(step_number, result)

            # Add assistant's thoughts and code to message history.
            self.agent.add_message(response, role="assistant")

            # Feed the formatted environment result back as a user message.
            result_context = format_result(result, step_count)
            self.agent.add_message(result_context, role="user")

            step_count += 1

        # Handle budget exhaustion.
        if completion_reason is None:
            logger.info(f"Budget exhausted after {step_count} steps")
            completion_reason = f"BUDGET_EXHAUSTED_AFTER_{step_count}_STEPS"

        # Generate final summary.
        logger.info("Generating execution summary")
        summary = self._generate_summary(execution_history, task_instruction)

        result = {
            "task_instruction": task_instruction,
            "completion_reason": completion_reason,
            "summary": summary,
            "execution_history": execution_history,
            "steps_executed": step_count,
            "budget": self.budget,
        }

        logger.info(f"Code execution completed: steps={step_count}")
        return result

    def _generate_summary(self, execution_history: List[Dict], task_instruction: str) -> str:
        """Generate summary of code execution session.

        Args:
            execution_history: Entries with ``step``, ``action`` and
                ``thoughts`` keys, as recorded by ``execute``.
            task_instruction: The task the session was working on.

        Returns:
            The LLM-written summary, ``"No actions were executed."`` for an
            empty history, or a "Summary generation failed" message when the
            LLM call raises or returns nothing.
        """
        if not execution_history:
            logger.info("No execution history to summarize")
            return "No actions were executed."

        logger.info(f"Generated summary for {len(execution_history)} steps")

        # Build detailed execution context for the summary agent.
        context_parts = [f"Task: {task_instruction}\n\nExecution Steps:\n"]
        for step in execution_history:
            step_num = step["step"]
            thoughts = step.get("thoughts", "")
            action = step.get("action", "")

            context_parts.append(f"\nStep {step_num}:\n")
            if thoughts:
                context_parts.append(f"Thoughts: {thoughts}\n")
            context_parts.append(f"Code: {action}\n")
        execution_context = "".join(context_parts)

        # Create summary prompt with same context as coding agent.
        summary_prompt = (
            f"\n{execution_context}\n"
            "Please provide a concise summary of the code execution session. Focus on:\n"
            "1. The code logic implemented at each step\n"
            "2. The outputs and results produced by each code execution\n"
            "3. The progression of the solution approach\n"
            "Do not make judgments about success or failure. Simply describe what was attempted and what resulted.\n"
            "Keep the summary under 150 words and use clear, factual language.\n"
        )

        # Generate summary using LLM with dedicated summary system prompt.
        # Summary failure is deliberately non-fatal: the session result is
        # still returned, carrying a failure note in place of the summary.
        try:
            summary_agent = LMMAgent(
                engine_params=self.engine_params,
                system_prompt=PROCEDURAL_MEMORY.CODE_SUMMARY_AGENT_PROMPT,
            )
            summary_agent.add_message(summary_prompt, role="user")
            summary = call_llm_safe(summary_agent, temperature=1)

            if not summary or summary.strip() == "":
                summary = "Summary generation failed - no response from LLM"
                logger.warning("Summary generation failed - empty response from LLM")
        except Exception as e:
            summary = f"Summary generation failed: {str(e)}"
            logger.error(f"Error generating summary: {e}")

        return summary