101 linhas
4.3 KiB
Python
101 linhas
4.3 KiB
Python
import os
|
|
import base64
|
|
from typing import List, Tuple, Optional, List
|
|
|
|
from gui_agents.s3.core.mllm import LMMAgent
|
|
from gui_agents.s3.memory.procedural_memory import PROCEDURAL_MEMORY
|
|
from gui_agents.s3.utils.common_utils import call_llm_formatted, split_thinking_response
|
|
|
|
|
|
def get_final_screenshot_file(task_dir: str) -> str:
|
|
"""Get the final screenshot file name from a task directory."""
|
|
screenshot_files = []
|
|
for filename in os.listdir(task_dir):
|
|
if filename.startswith("step_") and filename.endswith(".png"):
|
|
screenshot_files.append(filename)
|
|
|
|
if not screenshot_files:
|
|
return "step_0.png" # fallback
|
|
|
|
# Sort by step number and get the last one
|
|
def extract_step_num(filename):
|
|
try:
|
|
return int(filename.split("_")[1].split(".")[0])
|
|
except:
|
|
return 0
|
|
|
|
screenshot_files.sort(key=extract_step_num)
|
|
return screenshot_files[-1]
|
|
|
|
|
|
def image_to_openai_message_format(image_path: str, caption: str = "") -> Optional[dict]:
|
|
"""Convert an image file to OpenAI message format."""
|
|
if not os.path.exists(image_path):
|
|
return None
|
|
|
|
try:
|
|
with open(image_path, "rb") as image_file:
|
|
image_data = base64.b64encode(image_file.read()).decode('utf-8')
|
|
|
|
content = []
|
|
if caption:
|
|
content.append({"type": "text", "text": caption})
|
|
|
|
content.append({
|
|
"type": "image_url",
|
|
"image_url": {
|
|
"url": f"data:image/png;base64,{image_data}",
|
|
"detail": "high"
|
|
}
|
|
})
|
|
|
|
return {"role": "user", "content": content}
|
|
except Exception as e:
|
|
print(f"Error loading image {image_path}: {e}")
|
|
return None
|
|
|
|
|
|
class ComparativeJudge:
|
|
def __init__(self, engine_params):
|
|
self.judge_agent = LMMAgent(engine_params=engine_params)
|
|
|
|
def judge(self, task_description: str, task: str, result_dirs: List[str], all_fact_captions: List[List[str]]) -> Tuple[str, str, Optional[str]]:
|
|
"""
|
|
Fact captions + initial/final screenshots judging.
|
|
Pipeline: use provided fact captions → include initial/final screenshots → judge.
|
|
"""
|
|
num_trajectories = len(result_dirs)
|
|
system_prompt = PROCEDURAL_MEMORY.VLM_EVALUATOR_PROMPT_COMPARATIVE_BASELINE
|
|
system_prompt = system_prompt.replace("<TASK_DESCRIPTION_INPUT>", task_description)
|
|
system_prompt = system_prompt.replace("<NUMBER OF TRAJECTORIES>", str(num_trajectories))
|
|
|
|
messages = [{"role": "system", "content": system_prompt}]
|
|
|
|
for i, (result_dir, fact_captions) in enumerate(zip(result_dirs, all_fact_captions)):
|
|
task_dir = os.path.join(result_dir, task.split("/")[0], task.split("/")[1])
|
|
result_initial_screenshot = os.path.join(task_dir, "step_0.png")
|
|
result_final_screenshot = os.path.join(task_dir, get_final_screenshot_file(task_dir))
|
|
initial_screenshot_message = image_to_openai_message_format(result_initial_screenshot, caption=f"Initial screenshot of result{i+1}")
|
|
final_screenshot_message = image_to_openai_message_format(result_final_screenshot, caption=f"Final screenshot of result{i+1}")
|
|
if initial_screenshot_message is not None and final_screenshot_message is not None:
|
|
messages.append(initial_screenshot_message)
|
|
messages.append(final_screenshot_message)
|
|
if fact_captions:
|
|
messages.append({"role": "user", "content": [{"type": "text", "text": f"Fact captions for Trajectory {i+1}:"}] + [{"type": "text", "text": caption} for caption in fact_captions]})
|
|
|
|
messages.append({"role": "user", "content": [{"type": "text", "text": f"Please evaluate the {num_trajectories} trajectories based on the criteria provided in the system prompt."}]})
|
|
|
|
response = call_llm_formatted(self.judge_agent, [], messages=messages)
|
|
answer, thoughts = split_thinking_response(response)
|
|
|
|
try:
|
|
judge_choice = int(answer)
|
|
if 1 <= judge_choice <= num_trajectories:
|
|
selected_trajectory = result_dirs[judge_choice - 1]
|
|
else:
|
|
selected_trajectory = None
|
|
except ValueError:
|
|
selected_trajectory = None
|
|
|
|
return answer, thoughts, selected_trajectory
|