add openaci support, license update, separated kbs

2024-10-10 19:08:15 -07:00
commit ab243e9352
@@ -0,0 +1,163 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/latest/usage/project/#working-with-version-control
+.pdm.toml
+.pdm-python
+.pdm-build/
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+logs/
@@ -1,21 +1,201 @@
-MIT License
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/

-Copyright (c) 2024 Simular, Inc.
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
+   1. Definitions.

-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.

-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
@@ -32,7 +32,10 @@ agent = GraphSearchAgent(
    action_space=args.action_space,
    observation_type=args.observation_type,
    max_trajectory_length=args.max_trajectory_length,
-    vm_version=args.vm_version,
+    vm_version='latest'
 )
 ```
-The permissible values for the model argument are `gpt-4o`, `gpt-4o-mini` for OpenAI models, and `claude-3-5-sonnet-20240620` for Anthropic models. 
+The permissible values for the model argument are `gpt-4o`, `gpt-4o-mini` for OpenAI models, and `claude-3-5-sonnet-20240620` for Anthropic models. 
+
+The latest Agent S is configured to use the latest Ubuntu VM image from OSWorld. However, our reported experiments were run on the older version of the VM. To reproduce those results, set the `vm_version` argument to `'old'` when instantiating the agent.
+
@@ -109,7 +109,7 @@ To deploy Agent S in WindowsAgentArena, follow the [WindowsAgentArena Deployment

 We support running Agent S directly on your own system through [OpenACI](https://github.com/simular-ai/OpenACI). To run Agent S on your own system run: 
 ```
-python openaci/cli_app.py --agent agent_s --model <MODEL>
+python examples/cli_app.py --model <MODEL>
 ```

 This will show a user query prompt where you can enter your query and interact with Agent S. 
@@ -1,9 +1,10 @@
 from agent_s.ProceduralMemory import PROCEDURAL_MEMORY
-from agent_s.agent_s.osworld.GroundingAgent import GroundingAgent
+# from agent_s.osworld.GroundingAgent import GroundingAgent
 from agent_s.MultimodalEngine import OpenAIEmbeddingEngine
 import numpy as np
 import json
 import pickle
+import platform
 import os
 from sklearn.metrics.pairwise import cosine_similarity
 from typing import Dict, List, Tuple
@@ -19,9 +20,6 @@ logger = logging.getLogger("desktopenv.agent")
 # Get the directory of the current script
 working_dir = os.path.dirname(os.path.abspath(__file__))

-# Construct the full path to the JSON file
-file_path = os.path.join(working_dir, "kb", "formulate_query.json")
-
 NUM_IMAGE_TOKEN = 1105  # Value set of screen of size 1920x1080 for openai vision


@@ -29,10 +27,11 @@ class Planner:
    def __init__(
        self,
        engine_params: Dict,
-        grounding_agent: GroundingAgent,
+        grounding_agent,
        search_engine: str = "perplexica",
        use_plan_cache: bool = False,
        multi_round: bool = False,
+        experiment_type: str = "osworld",
    ):
        # TODO: move the prompt to Procedural Memory
        self.generator_agent = LMMAgent(
@@ -59,11 +58,12 @@ class Planner:
        self.search_engine = search_engine
        self.use_plan_cache = use_plan_cache
        self.multi_round = multi_round
+        self.experiment_type = experiment_type

-        self.plan_cache_path = os.path.join(working_dir, "kb", "graph_agent_plans.json")
-        self.dag_cache_path = os.path.join(working_dir, "kb", "graph_agent_dags.json")
+        self.plan_cache_path = os.path.join(working_dir, "kb", self.experiment_type, "graph_agent_plans.json")
+        self.dag_cache_path = os.path.join(working_dir, "kb", self.experiment_type, "graph_agent_dags.json")
        self.search_query_cache_path = os.path.join(
-            working_dir, "kb", "formulate_query.json"
+            working_dir, "kb", self.experiment_type, "formulate_query.json"
        )

        # open cache files
@@ -90,10 +90,12 @@ class Planner:
        search_results = ""
        # Formulate query for searching
        try:
-            query_path = os.path.join(working_dir, "kb", "formulate_query.json")
+            query_path = os.path.join(working_dir, "kb", self.experiment_type, "formulate_query.json")
            formulate_query = json.load(open(query_path))
        except:
            formulate_query = {}
+        
+        print('query', formulate_query)

        if instruction in formulate_query.keys() and formulate_query[instruction]:
            search_query = formulate_query[instruction]
@@ -109,9 +111,20 @@ class Planner:
                    "TASK_DESCRIPTION", instruction
                ).replace("ACCESSIBLITY_TREE", current_state),
            )
-
+            if self.experiment_type == "osworld":
+                current_os = 'Ubuntu'
+            elif self.experiment_type == "windowsagentarena":
+                current_os = 'Windows 11'
+            elif self.experiment_type == "openaci":
+                if platform.system() == "Linux":
+                    current_os = 'Ubuntu'
+                elif platform.system() == "Windows":
+                    current_os = 'Windows 11'
+                elif platform.system() == "Darwin":
+                    current_os = 'MacOS'
+            print(current_os)
            self.rag_agent.add_message(
-                "To use google search to get some useful information, first carefully analyze the accessibility tree of the current desktop UI state, then given the task instruction, formulate a question that can be used to search on the Internet for information in helping with the task execution.\nThe question should not be too general or too specific, but it should be based on the current desktop UI state (e.g., already open website or application). You should expect the google search will return you something useful based on the question. Since it is a desktop computer task, make sure to mention the corresponding task domain in the question and also mention the Ubuntu OS if you think the OS matters. Please ONLY provide the question.\nQuestion:"
+                f"To use google search to get some useful information, first carefully analyze the accessibility tree of the current desktop UI state, then given the task instruction, formulate a question that can be used to search on the Internet for information in helping with the task execution.\nThe question should not be too general or too specific, but it should be based on the current desktop UI state (e.g., already open website or application). You should expect the google search will return you something useful based on the question. Since it is a desktop computer task, make sure to mention the corresponding task domain in the question and also mention the {current_os} OS if you think the OS matters. Please ONLY provide the question.\nQuestion:"
            )
            search_query = call_llm_safe(self.rag_agent)
            assert type(search_query) == str
@@ -130,7 +143,7 @@ class Planner:
        # Search from different engines
        if engine == "llm":
            logger.info("Search Engine: LLM")
-            file = os.path.join(working_dir, "kb", "llm_rag_knowledge.json")
+            file = os.path.join(working_dir, "kb", self.experiment_type, "llm_rag_knowledge.json")

            try:
                exist_search_results = json.load(open(file))
@@ -154,7 +167,7 @@ class Planner:

        elif engine == "perplexica":
            logger.info("Search Engine: Perplexica Search")
-            file = os.path.join(working_dir, "kb", "perplexica_rag_knowledge.json")
+            file = os.path.join(working_dir, "kb", self.experiment_type, "perplexica_rag_knowledge.json")

            try:
                exist_search_results = json.load(open(file))
@@ -210,7 +223,7 @@ class Planner:
            lifelong_learning_reflection_dicts = json.load(
                open(
                    os.path.join(
-                        working_dir, "kb", "lifelong_learning_knowledge_base.json"
+                        working_dir, "kb", self.experiment_type, "lifelong_learning_knowledge_base.json"
                    )
                )
            )
@@ -228,13 +241,13 @@ class Planner:
            knowledge_base_dict = json.load(
                open(
                    os.path.join(
-                        working_dir, "kb", "lifelong_learning_knowledge_base.json"
+                        working_dir, "kb", self.experiment_type, "lifelong_learning_knowledge_base.json"
                    )
                )
            )

            try:
-                with open(os.path.join(working_dir, "kb", "embeddings.pkl"), "rb") as f:
+                with open(os.path.join(working_dir, "kb",  "embeddings.pkl"), "rb") as f:
                    embeddings = pickle.load(f)
            except:
                embeddings = {}
@@ -257,7 +270,7 @@ class Planner:
                candidate_embeddings.append(candidate_embedding)
            candidate_embeddings = np.vstack(candidate_embeddings)

-            with open(os.path.join(working_dir, "kb", "embeddings.pkl"), "wb") as f:
+            with open(os.path.join(working_dir, "kb", self.experiment_type, "embeddings.pkl"), "wb") as f:
                pickle.dump(embeddings, f)

            # instruction_embedding = self.embedding_engine.get_embeddings(instruction)
@@ -6,16 +6,17 @@ import time

 import pyautogui
 import io 
+import json 

 try:
    if platform.system() == 'Darwin':
        platform_name = 'macos'
        from openaci.macos.Grounding import GroundingAgent as OpenACIGroundingAgent
-        from openaci.macos.Grounding import OpenACIUIElement
+        from openaci.macos.Grounding import UIElement as OpenACIUIElement
    elif platform.system() == 'Linux':
        platform_name = 'ubuntu'
        from openaci.ubuntu.Grounding import GroundingAgent as OpenACIGroundingAgent
-        from openaci.ubuntu.UIElement import OpenACIUIElement
+        from openaci.ubuntu.Grounding import UIElement as OpenACIUIElement
 except ImportError:
    print(
        "The OpenACI package is not installed. To use the OpenACI grounding agent for Agent S, please install the OpenACI package."
@@ -54,7 +55,7 @@ class GraphSearchAgent:
                 a11y_tree_max_tokens=10000,
                 enable_reflection=True,
                 engine="perplexica",
-                 vm_version="old"):
+                 vm_version="new"):

        # resets the agent by initializing submodules
        self.experiment_type = experiment_type
@@ -80,8 +81,8 @@ class GraphSearchAgent:
        else:
            raise ValueError(f"Invalid experiment type: {self.experiment_type}")

-        self.planner = Planner(self.engine_params, self.grounding_agent)
-        self.executor = Executor(self.engine_params, self.grounding_agent)
+        self.planner = Planner(self.engine_params, self.grounding_agent, experiment_type=self.experiment_type)
+        self.executor = Executor(self.engine_params, self.grounding_agent, experiment_type=self.experiment_type)
        self.replan = True
        self.get_next_subtask = True
        self.step_count = 0
@@ -187,12 +188,67 @@ class GraphSearchAgent:
        'subtask_info': self.current_subtask.info,
        'subtask_status': self.subtask_status})

-        if self.self_eval:
-            curr_atree = self.grounding_agent.linearize_and_annotate_tree(obs)
-            curr_img = obs['screenshot']
-            return info, (curr_atree, curr_img), actions
-        else:
-            return info, actions
+
+        return info, actions
+    
+    def update_narrative_memory(self, traj):
+        """
+        Update the narrative memory with a reflection for the current search query.
+
+        Loads kb/<experiment_type>/lifelong_learning_knowledge_base.json (an
+        empty knowledge base is used when the file is missing or unreadable).
+        If no reflection is stored for self.search_query yet, one is generated
+        by the planner from `traj` and the knowledge base is written back.
+
+        This is best-effort: any error is printed and swallowed so that a
+        failed memory update never aborts the caller.
+
+        Args:
+            traj: Trajectory handed to planner.generate_lifelong_learning_reflection.
+        """
+        try:
+            # NOTE(review): the original read `self.pdate_reflection`, which looks
+            # like a typo for `update_reflection` -- confirm the attribute name
+            # set in __init__.
+            if not self.update_reflection:
+                return
+            print(self.search_query)
+            reflection_path = os.path.join(working_dir, "kb", self.experiment_type, "lifelong_learning_knowledge_base.json")
+            try:
+                with open(reflection_path) as fin:
+                    lifelong_learning_reflections = json.load(fin)
+            except (OSError, ValueError):
+                # Missing file or invalid JSON: start from an empty knowledge base.
+                lifelong_learning_reflections = {}
+            # Look up and store under the same key; the original checked
+            # self.planner.search_query but stored under self.search_query.
+            if self.search_query in lifelong_learning_reflections:
+                return
+            lifelong_learning_reflections[self.search_query] = self.planner.generate_lifelong_learning_reflection(traj)
+            with open(reflection_path, "w") as fout:
+                json.dump(lifelong_learning_reflections, fout, indent=2)
+        except Exception as e:
+            print(e)
+
+    def update_episodic_memory(self, meta_data, subtask_traj):
+        """
+        Update the episodic memory and return the running subtask trajectory.
+
+        When the subtask status is 'Start' or 'Done' and a trajectory has been
+        accumulated, that trajectory is closed, summarized by the planner (unless
+        a summary for the same key is already stored) and saved to
+        kb/<experiment_type>/subtask_experience_knowledge_base.json. A fresh
+        trajectory is then started for the subtask described by `meta_data`.
+        When the status is 'In', the current executor plan is appended.
+
+        Args:
+            meta_data: Dict read for the keys 'subtask', 'subtask_info',
+                'subtask_status' and 'executor_plan'.
+            subtask_traj: Trajectory text accumulated so far ('' if none).
+
+        Returns:
+            The updated trajectory string, to be passed in on the next call.
+        """
+        subtask = meta_data['subtask']
+        subtask_info = meta_data['subtask_info']
+        subtask_status = meta_data['subtask_status']
+        plan_separator = '\n----------------------\n\nPlan:\n'
+
+        if subtask_status in ('Start', 'Done'):
+            # Finalize the previous subtask trajectory, if there is one.
+            if subtask_traj:
+                subtask_traj += '\nSubtask Completed.\n'
+                # The header before the first plan separator is the knowledge-base key.
+                subtask_key = subtask_traj.split(plan_separator)[0]
+                subtask_path = os.path.join(working_dir, "kb", self.experiment_type, "subtask_experience_knowledge_base.json")
+                try:
+                    with open(subtask_path) as fin:
+                        kb = json.load(fin)
+                except (OSError, ValueError):
+                    # Missing file or invalid JSON: start from an empty knowledge base.
+                    kb = {}
+                if subtask_key not in kb:
+                    subtask_summarization = self.planner.generate_subtask_summarization(subtask_traj)
+                    kb[subtask_key] = subtask_summarization
+                else:
+                    subtask_summarization = kb[subtask_key]
+                logger.info("subtask_key: %s", subtask_key)
+                logger.info("subtask_summarization: %s", subtask_summarization)
+                with open(subtask_path, "w") as fout:
+                    json.dump(kb, fout, indent=2)
+            # Start a new subtask trajectory
+            subtask_traj = 'Task:\n' + self.search_query + '\n\nSubtask: ' + subtask + '\nSubtask Instruction: ' + subtask_info + plan_separator + meta_data['executor_plan'] + '\n'
+        elif subtask_status == 'In':
+            # Continue appending to the current subtask trajectory if it's still ongoing
+            subtask_traj += plan_separator + meta_data['executor_plan'] + '\n'
+
+        return subtask_traj
    
    def run(self, instruction: str):
        obs = {}
@@ -245,3 +301,4 @@ class GraphSearchAgent:
                # Update task and subtask trajectories and optionally the episodic memory
                traj += '\n\nReflection:\n' + str(info['reflection']) + '\n\n----------------------\n\nPlan:\n' + info['executor_plan']
                subtask_traj = self.update_episodic_memory(info, subtask_traj)
+                
@@ -1,7 +1,6 @@
 import inspect
 import textwrap

-current_os = "Ubuntu"


 class PROCEDURAL_MEMORY:
@@ -11,7 +10,7 @@ class PROCEDURAL_MEMORY:
            f"""\
        You are an expert in graphical user interfaces and Python code. You are responsible for executing the current subtask: `SUBTASK_DESCRIPTION` of the larger goal: `TASK_DESCRIPTION`.
        IMPORTANT: ** The subtasks: ['DONE_TASKS'] have already been done. The future subtasks ['FUTURE_TASKS'] will be done in the future by me. You must only perform the current subtask: `SUBTASK_DESCRIPTION`. Do not try to do future subtasks. **
-        You are working in {current_os}. You must only complete the subtask provided and not the larger goal.
+        You are working in CURRENT_OS. You must only complete the subtask provided and not the larger goal.
        You are provided with:
        1. A simplified accessibility tree of the UI at the current time step.
        2. A screenshot of the current time step.
@@ -313,7 +312,7 @@ class PROCEDURAL_MEMORY:

    RAG_AGENT = """
    Given a desktop computer task instruction, you are an agent which should provide useful information as requested, to help another agent follow the instruction and perform the task.
-    The domain of the desktop computer task is from [Ubuntu, VLC, LibreOffice, Chrome, Thunderbird, VS Code, GIMP].
+    The domain of the desktop computer task is from [CURRENT_OS, VLC, LibreOffice, Chrome, Thunderbird, VS Code, GIMP].
    The task is: TASK_DESCRIPTION
    The simplified accessibility tree of the current computer UI is: ACCESSIBLITY_TREE
    """
@@ -1,6 +1,6 @@
 import time
 from agent_s.ProceduralMemory import PROCEDURAL_MEMORY
-from agent_s.agent_s.osworld.GroundingAgent import GroundingAgent
+from agent_s.osworld.GroundingAgent import GroundingAgent
 from agent_s.MultimodalEngine import OpenAIEmbeddingEngine
 import numpy as np
 import json
@@ -1,9 +1,10 @@
 from agent_s.ProceduralMemory import PROCEDURAL_MEMORY
-from agent_s.agent_s.osworld.GroundingAgent import GroundingAgent
+from agent_s.osworld.GroundingAgent import GroundingAgent
 from agent_s.MultimodalEngine import OpenAIEmbeddingEngine
 import json
 import numpy as np
 import pickle
+import platform
 import os
 from sklearn.metrics.pairwise import cosine_similarity
 from typing import Dict, List, Tuple
@@ -51,6 +52,7 @@ class Executor:
        search_engine: str = "perplexica",
        enable_reflection: bool = True,
        use_subtask_experience: bool = True,
+        experiment_type: str = "osworld",
    ):
        self.grounding_agent = grounding_agent

@@ -58,6 +60,7 @@ class Executor:
        self.engine_params = engine_params
        self.search_engine = search_engine
        self.use_subtask_experience = use_subtask_experience
+        self.experiment_type = experiment_type
        self.reset()

    def flush_messages(self, n):
@@ -73,13 +76,27 @@ class Executor:
        self.reflection_agent = LMMAgent(self.engine_params)
        self.rag_agent = LMMAgent(self.engine_params)
        self.embedding_engine = OpenAIEmbeddingEngine()
+
+        if self.experiment_type == "osworld":
+            current_os = 'Ubuntu'
+        elif self.experiment_type == "windowsagentarena":
+            current_os = 'Windows 11'
+        elif self.experiment_type == "openaci":
+            if platform.system() == "Linux":
+                current_os = 'Ubuntu'
+            elif platform.system() == "Windows":
+                current_os = 'Windows 11'
+            elif platform.system() == "Darwin":
+                current_os = 'MacOS'
+        
        self.generator_system_prompt = PROCEDURAL_MEMORY.construct_procedural_memory(
            GroundingAgent
-        )
+        ).replace("CURRENT_OS", current_os)
        self.reflection_module_system_prompt = (
            PROCEDURAL_MEMORY.REFLECTION_ON_TRAJECTORY
        )
-        self.rag_module_system_prompt = PROCEDURAL_MEMORY.RAG_AGENT
+        self.rag_module_system_prompt = PROCEDURAL_MEMORY.RAG_AGENT.replace("CURRENT_OS", current_os)   
+        
        self.turn_count = 0
        self.planner_history = []
        self.reflections = []
@@ -93,7 +110,7 @@ class Executor:
            knowledge_base_dict = json.load(
                open(
                    os.path.join(
-                        working_dir, "kb", "subtask_experience_knowledge_base.json"
+                        working_dir, "kb", self.experiment_type, "subtask_experience_knowledge_base.json"
                    )
                )
            )
@@ -122,7 +139,7 @@ class Executor:
                candidate_embeddings.append(candidate_embedding)
            candidate_embeddings = np.vstack(candidate_embeddings)

-            with open(os.path.join(working_dir, "kb", "embeddings.pkl"), "wb") as f:
+            with open(os.path.join(working_dir, "kb", self.experiment_type, "embeddings.pkl"), "wb") as f:
                pickle.dump(embeddings, f)

            similarities = cosine_similarity(
@@ -151,7 +168,7 @@ class Executor:
    def retrieve_subtask_knowledge(self, instruction, current_state, engine):
        # Formulate query for searching
        try:
-            query_path = os.path.join(working_dir, "kb", "formulate_query.json")
+            query_path = os.path.join(working_dir, "kb", self.experiment_type, "formulate_query.json")
            formulate_query = json.load(open(query_path))
        except:
            formulate_query = {}
@@ -182,7 +199,7 @@ class Executor:
        # Search from different engines
        if engine == "llm":
            logger.info("Search Engine: LLM")
-            file = os.path.join(working_dir, "kb", "llm_rag_knowledge.json")
+            file = os.path.join(working_dir, "kb", self.experiment_type, "llm_rag_knowledge.json")

            try:
                exist_search_results = json.load(open(file))
@@ -204,7 +221,7 @@ class Executor:

        elif engine == "perplexica":
            logger.info("Search Engine: Perplexica Search")
-            file = os.path.join(working_dir, "kb", "perplexica_rag_knowledge.json")
+            file = os.path.join(working_dir, "kb", self.experiment_type, "perplexica_rag_knowledge.json")

            try:
                exist_search_results = json.load(open(file))
@@ -1,7 +1,7 @@
 import time
 import xml.etree.ElementTree as ET
 from agent_s.ProceduralMemory import PROCEDURAL_MEMORY
-from agent_s.agent_s.osworld.GroundingAgent import GroundingAgent
+from agent_s.osworld.GroundingAgent import GroundingAgent
 from agent_s.MultimodalEngine import OpenAIEmbeddingEngine
 import numpy as np
 import json
@@ -0,0 +1,2 @@
+{
+}
@@ -0,0 +1 @@
+{"disable autosave please": "{\"nodes\":[{\"name\":\"Open Settings\",\"info\":\"Click on the Code menu in the top menu bar. Select Preferences, then Settings. Alternatively, use the shortcut Cmd + ,.\"},{\"name\":\"Search for Auto Save\",\"info\":\"In the Settings editor, type \\\"autosave\\\" into the search bar.\"},{\"name\":\"Disable Auto Save\",\"info\":\"Find the setting labeled Files: Auto Save. Click on the dropdown menu next to it and select off.\"}],\"edges\":[[{\"name\":\"Open Settings\",\"info\":\"Click on the Code menu in the top menu bar. Select Preferences, then Settings. Alternatively, use the shortcut Cmd + ,.\"},{\"name\":\"Search for Auto Save\",\"info\":\"In the Settings editor, type \\\"autosave\\\" into the search bar.\"}],[{\"name\":\"Search for Auto Save\",\"info\":\"In the Settings editor, type \\\"autosave\\\" into the search bar.\"},{\"name\":\"Disable Auto Save\",\"info\":\"Find the setting labeled Files: Auto Save. Click on the dropdown menu next to it and select off.\"}]]}"}
@@ -0,0 +1 @@
+{"disable autosave please": "To disable autosave in Visual Studio Code on macOS, follow these steps:\n\n1. **Open Settings**:\n   - Click on the **Code** menu in the top menu bar.\n   - Select **Preferences**, then **Settings**. Alternatively, use the shortcut **Cmd + ,**.\n\n2. **Search for Auto Save**:\n   - In the Settings editor, type \"autosave\" into the search bar.\n\n3. **Disable Auto Save**:\n   - Find the setting labeled **Files: Auto Save**.\n   - Click on the dropdown menu next to it and select **off**.\n\nThese steps will disable the auto-save feature, requiring manual saving of files."}
@@ -0,0 +1,2 @@
+{
+}
@@ -645,5 +645,10 @@
  "Open the blog at https://developer.apple.com/design/human-interface-guidelines/searching in Google Chrome, and use the browser's developer tools to extract the HTML content from 'searching' to just before 'resources.' Save this extracted HTML content as \"searching.html\" on the desktop.": "How to use Google Chrome developer tools to extract specific HTML content on Ubuntu?",
  "Could you please open the downloaded PDF file on the desktop and search for the term \"neural network\" within the document using the built-in search functionality in the PDF viewer?": "How to search for a term within a PDF document using the built-in search functionality in the default PDF viewer on Ubuntu?",
  "I need to convert a large DOCX file to PDF using the command line on my Ubuntu virtual machine. The DOCX file is located on my Desktop. Can you guide me through the process?": "How to convert a DOCX file to PDF using the command line on Ubuntu?",
-  "I am looking for an website address I accessed a month ago, but Youtube websites which take almost all of my browsing history are interrupting my search. This is too annoying. I want to remove all my Youtube browsing history first to facilitate my search. Could you help me clear browsing history from Youtube?": "How to clear YouTube browsing history in Chromium on Ubuntu?"
+  "I am looking for an website address I accessed a month ago, but Youtube websites which take almost all of my browsing history are interrupting my search. This is too annoying. I want to remove all my Youtube browsing history first to facilitate my search. Could you help me clear browsing history from Youtube?": "How to clear YouTube browsing history in Chromium on Ubuntu?",
+  "search for plants on safari": "How to search for plants using Google on Ubuntu with Chrome browser?",
+  "search for animals on safari": "How can I use VS Code on Ubuntu to automate web searches for animal information using Python scripts?",
+  "disable autosave": "How to disable autosave in VS Code on Ubuntu?",
+  "disable autosave on vscode": "How to disable autosave feature in Visual Studio Code on Ubuntu?",
+  "disable autosave ": "How to disable autosave in VS Code on Ubuntu?"
 }
@@ -645,5 +645,8 @@
  "Open the blog at https://developer.apple.com/design/human-interface-guidelines/searching in Google Chrome, and use the browser's developer tools to extract the HTML content from 'searching' to just before 'resources.' Save this extracted HTML content as \"searching.html\" on the desktop.": "Using Google Chrome Developer Tools to extract specific HTML content on Ubuntu involves a few straightforward steps. These tools are integrated into the Chrome browser and provide developers with powerful capabilities for inspecting and manipulating web pages. Here's how you can use them effectively:\n\n### Steps to Extract HTML Content\n\n1. **Open Chrome Developer Tools**:\n   - You can open Developer Tools by right-clicking on the webpage and selecting \"Inspect\" or by using the keyboard shortcut `Ctrl + Shift + I` on Ubuntu.\n\n2. **Navigate to the Elements Tab**:\n   - Once Developer Tools is open, navigate to the \"Elements\" tab. This tab displays the HTML structure of the webpage you are inspecting. You can hover over elements to see their corresponding areas highlighted on the page.\n\n3. **Locate the Specific HTML Element**:\n   - Use the mouse to hover over elements in the \"Elements\" tab or directly on the webpage. You can click on any element to reveal its HTML code in the panel. This is particularly useful for identifying the specific content you want to extract.\n\n4. **Copy the HTML**:\n   - After finding the desired element, right-click on it in the \"Elements\" tab. You will see options such as \"Copy\" and then choose \"Copy outerHTML\" or \"Copy innerHTML\" depending on whether you need the entire element or just its contents. This action copies the selected HTML to your clipboard [5][6].\n\n5. **Using the Console for More Specific Extraction**:\n   - If you want to extract data programmatically, you can switch to the \"Console\" tab. Here, you can run JavaScript commands to select elements and retrieve their HTML content. 
For example:\n     ```javascript\n     document.querySelector('your-selector').innerHTML\n     ```\n     Replace `'your-selector'` with the appropriate CSS selector for the element you want to extract. Press `Enter` to execute the command, and the HTML will be displayed in the console [4].\n\n### Alternative Method: Headless Mode\n\nIf you prefer to extract HTML content from the command line, you can use Chrome's headless mode. This is particularly useful for automation:\n\n1. **Open Terminal**:\n   - Open a terminal window in Ubuntu.\n\n2. **Run the Headless Command**:\n   - Use the following command to fetch the HTML of a webpage:\n     ```bash\n     google-chrome --headless --dump-dom 'URL' > ~/file.html\n     ```\n   - Replace `'URL'` with the actual URL of the webpage. This command will save the entire HTML DOM structure to a file named `file.html` in your home directory [8].\n\n### Summary\n\nUsing Google Chrome Developer Tools on Ubuntu allows you to easily inspect, copy, and manipulate HTML content. Whether you prefer a graphical interface or command-line automation, Chrome provides the necessary tools to achieve your goals effectively. By following the above steps, you can extract specific HTML content efficiently.",
  "Could you please open the downloaded PDF file on the desktop and search for the term \"neural network\" within the document using the built-in search functionality in the PDF viewer?": "To search for a term within a PDF document using the built-in search functionality in the default PDF viewer on Ubuntu, you can follow these steps:\n\n1. **Open Your PDF Document**: Start by launching the default PDF viewer, which is typically Evince, and open the PDF file you want to search.\n\n2. **Access the Search Function**: You can search for text in the PDF using several methods:\n   - **Keyboard Shortcut**: Press `Ctrl + F` to open the search bar directly. Alternatively, you can also use `Ctrl + B` to access the search feature.\n   - **Icon**: Look for a search icon (like a magnifying glass) within the application interface and click on it to open the search bar.\n   - **Menu Option**: Navigate to the menu, usually found under \"Edit\", and select the option for \"Find\" to activate the search functionality [1][2].\n\n3. **Enter Your Search Term**: In the search bar that appears, type in the word or phrase you would like to find within the document. \n\n4. **Initiate the Search**: After entering your desired term, press `Enter` or click on the search button to start searching through the document.\n\n5. **Navigate Through Results**: Use the provided navigation buttons (often arrows) to cycle through the search results. This will allow you to find each instance of the term you are searching for within the document [1][2].\n\nThese steps should help you efficiently locate text within any PDF file using Ubuntu's built-in viewer.",
  "I need to convert a large DOCX file to PDF using the command line on my Ubuntu virtual machine. The DOCX file is located on my Desktop. Can you guide me through the process?": "To convert a DOCX file to PDF using the command line on Ubuntu, you can utilize the LibreOffice command-line tools. Here\u2019s a step-by-step guide on how to accomplish this:\n\n### Prerequisites\n- Ensure that you have **LibreOffice** installed on your system. If not, you can install it using the following command:\n  ```bash\n  sudo apt-get install libreoffice\n  ```\n\n### Conversion Command\n1. **Open the Terminal**: You can do this by pressing `Ctrl + Alt + T` on your keyboard.\n   \n2. **Use the Command**: To convert a single DOCX file to PDF, you can run the following command:\n   ```bash\n   libreoffice --headless --convert-to pdf yourfile.docx\n   ```\n   Replace `yourfile.docx` with the actual name of your DOCX file. The `--headless` option allows LibreOffice to operate without a GUI, making it suitable for server environments or scripts.\n\n3. 
**Specify Output Directory (Optional)**: If you wish to specify an output directory for the converted PDF file, you can use:\n   ```bash\n   libreoffice --headless --convert-to pdf --outdir /path/to/output yourfile.docx\n   ```\n   Replace `/path/to/output` with your desired output directory path.\n\n### Batch Conversion\nIf you have multiple DOCX files you want to convert at once, you can use a wildcard to convert all DOCX files in a directory:\n```bash\nlibreoffice --headless --convert-to pdf *.docx\n```\nThis command will convert all DOCX files in the current directory to PDF format.\n\n### Additional Notes\n- The conversion process maintains the formatting of the original document, ensuring that the PDF looks similar to the DOCX file.\n- If you encounter any issues during installation or conversion, ensure that your LibreOffice installation is up to date and check for any dependencies that might be required.\n\nUsing these commands, you can efficiently convert DOCX documents to PDF format directly from the command line in Ubuntu [1][2][5][6][8].",
-  "I am looking for an website address I accessed a month ago, but Youtube websites which take almost all of my browsing history are interrupting my search. This is too annoying. I want to remove all my Youtube browsing history first to facilitate my search. Could you help me clear browsing history from Youtube?": "To clear your YouTube browsing history in Chromium on Ubuntu, you can follow these steps:\n\n1. **Open Chromium**: Launch the Chromium browser on your Ubuntu system.\n\n2. **Access History**:\n   - Click on the three-dot menu icon located in the top-right corner of the browser window. Or type chrome://settings/ in the URL bar and press Enter\n\n3. **Clear Browsing Data**:\n   -   Hover over \"More Tools\" and select \"Clear Browsing Data.\".\n\n4. **Select Time Range**: In the window that appears, you can choose the time range for which you want to clear the history (e.g., Last hour, Last 24 hours, All time).\n\n5. **Choose Data to Clear:**:\n   - Ensure that \"Browsing history\" is checked, and you can also select other options like \"Cookies and other site data\" and \"Cached images and files\".\n\n6. **Clear Data**: Click on the \"Clear data\" button to remove the selected items from your history."
+  "I am looking for an website address I accessed a month ago, but Youtube websites which take almost all of my browsing history are interrupting my search. This is too annoying. I want to remove all my Youtube browsing history first to facilitate my search. Could you help me clear browsing history from Youtube?": "To clear your YouTube browsing history in Chromium on Ubuntu, you can follow these steps:\n\n1. **Open Chromium**: Launch the Chromium browser on your Ubuntu system.\n\n2. **Access History**:\n   - Click on the three-dot menu icon located in the top-right corner of the browser window. Or type chrome://settings/ in the URL bar and press Enter\n\n3. **Clear Browsing Data**:\n   -   Hover over \"More Tools\" and select \"Clear Browsing Data.\".\n\n4. **Select Time Range**: In the window that appears, you can choose the time range for which you want to clear the history (e.g., Last hour, Last 24 hours, All time).\n\n5. **Choose Data to Clear:**:\n   - Ensure that \"Browsing history\" is checked, and you can also select other options like \"Cookies and other site data\" and \"Cached images and files\".\n\n6. **Clear Data**: Click on the \"Clear data\" button to remove the selected items from your history.",
+  "search for animals on safari": "To automate web searches for animal information using Python scripts in Visual Studio Code (VS Code) on Ubuntu, you can follow these steps:\n\n### 1. Install Visual Studio Code and Python\n- **Install VS Code**: If you haven't already, download and install Visual Studio Code on your Ubuntu system. You can usually find it in the Ubuntu Software Center or install it via terminal commands.\n- **Install Python**: Ensure that Python is installed on your system. You can check this by running `python3 --version` in the terminal. If it's not installed, you can install it using `sudo apt install python3`.\n\n### 2. Set Up VS Code for Python Development\n- **Launch VS Code**: Open VS Code from your applications menu.\n- **Install Python Extension**: Go to the Extensions view by clicking on the Extensions icon or pressing `Ctrl+Shift+X`. Search for \"Python\" and install the official Microsoft Python extension. Restart VS Code if prompted [2][3].\n\n### 3. Create a Python Script\n- **Create a New File**: Open a new file in VS Code and save it with a `.py` extension, for example, `animal_search.py`.\n- **Write Your Script**: Use Python libraries such as `requests` and `BeautifulSoup` to scrape web data. Here\u2019s a simple example to get you started:\n\n```python\nimport requests\nfrom bs4 import BeautifulSoup\n\n# Function to perform a web search\ndef search_animal_info(animal):\n    url = f\"https://www.google.com/search?q={animal}+information\"\n    headers = {'User-Agent': 'Mozilla/5.0'}\n    response = requests.get(url, headers=headers)\n\n    if response.status_code == 200:\n        soup = BeautifulSoup(response.text, 'html.parser')\n        results = soup.find_all('h3')  # modify based on the structure of the search results\n        for result in results:\n            print(result.text)\n    else:\n        print(\"Failed to retrieve information\")\n\n# Example usage\nsearch_animal_info(\"lion\")\n```\n\n### 4. 
Install Required Libraries\n- **Open the Terminal**: You can open the integrated terminal in VS Code by pressing `` Ctrl+` ``.\n- **Install Libraries**: If you haven't installed the `requests` and `BeautifulSoup` libraries, you can do so by running:\n\n```bash\npip install requests beautifulsoup4\n```\n\n### 5. Run Your Script\n- **Execute the Script**: In the terminal, ensure you are in the directory where your script is saved, and run the script using:\n\n```bash\npython3 animal_search.py\n```\n\n### 6. Automate and Extend\n- **Add Features**: You can extend the script to save results to a file, handle multiple searches, or even use libraries like `Selenium` for more complex tasks such as interacting with web elements dynamically.\n- **Use Virtual Environments**: To manage dependencies effectively, consider using virtual environments. You can create one in VS Code by using the Command Palette (`Ctrl+Shift+P`), then selecting `Python: Create Environment` and choosing your preferred environment type [8][14].\n\nBy following these steps, you can effectively use VS Code on Ubuntu to automate web searches for animal information using Python scripts. This setup allows for both simple and complex scraping tasks, depending on your requirements.",
+  "disable autosave": "To disable autosave in Visual Studio Code on Ubuntu, you can follow these steps:\n\n- **Menu Navigation**: Open Visual Studio Code and navigate to the top menu. Click on **File**, then locate the **Auto Save** option in the drop-down menu. This allows you to toggle the autosave feature on or off directly [2][14].\n\n- **Settings Editor**: Alternatively, you can access the settings editor for more granular control. Click on the gear icon in the bottom left corner of the window and select **Settings**. In the settings editor, type \"autoSave\" into the search bar to find the relevant setting. You will see an option labeled **Auto Save** under the Editor section. Uncheck this box to disable automatic saving [5][8].\n\n- **Keyboard Shortcut**: You can also use a keyboard shortcut to toggle autosave. Type the command **File: Toggle Auto Save** in the command palette (accessed by pressing `Ctrl + Shift + P`). This will enable or disable the autosave feature depending on its current state [11].\n\nOnce you have completed these steps, autosave should be disabled in your Visual Studio Code environment on Ubuntu. This will allow you to manage your file saves manually, reducing the risk of unintended changes being saved automatically.",
+  "disable autosave ": "To disable the autosave feature in Visual Studio Code (VS Code) on Ubuntu, you can follow these steps:\n\n- **Access the File Menu**: Open VS Code and click on the **File** menu at the top of the window. From the drop-down options, locate **Auto Save** to toggle the setting [2][7].\n\n- **Using the Settings Editor**: Alternatively, you can access the settings directly. Click on the gear icon (\u2699\ufe0f) located in the lower-left corner of the VS Code window and select **Settings**. In the Settings editor, type \"autoSave\" into the search bar to find the relevant setting. You will see an option labeled **Auto Save**. Uncheck the box to disable it [8][11].\n\n- **Toggling via Command Palette**: You can also disable autosave using the Command Palette. Press `Ctrl + Shift + P` to open the Command Palette, then type **Toggle Auto Save**. Selecting this option will switch the autosave feature off if it was enabled [6].\n\n- **Confirmation of Change**: After disabling autosave, you should notice that the \"dirty\" indicator (which shows when a file has unsaved changes) will reappear when you modify a saved file, confirming that autosave has been turned off [3].\n\nThese steps are applicable to both Windows and macOS, so you can follow them easily on your Ubuntu setup as well [7]."
 }
@@ -17,7 +17,7 @@ def agent_action(func):


 class GroundingAgent:
-    def __init__(self, top_app=None, vm_version="old", top_app_only=True, ocr=True):
+    def __init__(self, top_app=None, vm_version="new", top_app_only=True, ocr=True):
        self.active_apps = set()
        self.top_app = top_app
        self.top_app_only = (
@@ -0,0 +1,110 @@
+import os 
+import datetime 
+import base64
+import io
+import pyautogui
+import platform 
+import logging
+import sys
+import time 
+import argparse
+
+if platform.system() == 'Darwin':
+    from openaci.macos.UIElement import UIElement
+    from Foundation import *
+    from AppKit import *
+    from ApplicationServices import (
+        AXIsProcessTrusted,
+        AXUIElementCreateApplication,
+        AXUIElementCreateSystemWide,
+        CFEqual,
+    )
+
+    from ApplicationServices import (
+        AXUIElementCopyAttributeNames,
+        AXUIElementCopyAttributeValue,
+    )
+elif platform.system() == 'Linux':
+    from openaci.ubuntu.UIElement import UIElement
+
+from agent_s.GraphSearchAgent import GraphSearchAgent
+
+logger = logging.getLogger()
+logger.setLevel(logging.DEBUG)
+
+datetime_str: str = datetime.datetime.now().strftime("%Y%m%d@%H%M%S")
+
+log_dir = "logs"
+os.makedirs(log_dir, exist_ok=True)
+
+file_handler = logging.FileHandler(
+    os.path.join("logs", "normal-{:}.log".format(datetime_str)), encoding="utf-8"
+)
+debug_handler = logging.FileHandler(
+    os.path.join("logs", "debug-{:}.log".format(datetime_str)), encoding="utf-8"
+)
+stdout_handler = logging.StreamHandler(sys.stdout)
+sdebug_handler = logging.FileHandler(
+    os.path.join("logs", "sdebug-{:}.log".format(datetime_str)), encoding="utf-8"
+)
+
+file_handler.setLevel(logging.INFO)
+debug_handler.setLevel(logging.DEBUG)
+stdout_handler.setLevel(logging.INFO)
+sdebug_handler.setLevel(logging.DEBUG)
+
+formatter = logging.Formatter(
+    fmt="\x1b[1;33m[%(asctime)s \x1b[31m%(levelname)s \x1b[32m%(module)s/%(lineno)d-%(processName)s\x1b[1;33m] \x1b[0m%(message)s"
+)
+file_handler.setFormatter(formatter)
+debug_handler.setFormatter(formatter)
+stdout_handler.setFormatter(formatter)
+sdebug_handler.setFormatter(formatter)
+
+stdout_handler.addFilter(logging.Filter("desktopenv"))
+sdebug_handler.addFilter(logging.Filter("desktopenv"))
+
+logger.addHandler(file_handler)
+logger.addHandler(debug_handler)
+logger.addHandler(stdout_handler)
+logger.addHandler(sdebug_handler)
+
+platform_os = platform.system() 
+
+def main():
+    parser = argparse.ArgumentParser(description="Run GraphSearchAgent with specified model.")
+    parser.add_argument("--model", type=str, default="gpt-4o", help="Specify the model to use (e.g., gpt-4o)")
+    args = parser.parse_args()
+
+    while True:
+        query = input("Query: ")
+        if 'gpt' in args.model:
+            engine_type = 'openai'
+        elif 'claude' in args.model:
+            engine_type = 'anthropic'
+        engine_params = {
+            "engine_type": engine_type,
+            "model": args.model,
+        }
+        agent = GraphSearchAgent(
+            engine_params,
+            experiment_type='openaci',
+            platform=platform_os,
+            max_tokens=1500,
+            top_p=0.9,
+            temperature=0.5,
+            action_space="pyautogui",
+            observation_type="atree",
+            max_trajectory_length=3,
+            a11y_tree_max_tokens=10000,
+            enable_reflection=True,
+        )
+        agent.reset()
+        agent.run(instruction=query)
+        
+        response = input("Would you like to provide another query? (y/n): ")
+        if response.lower() != "y":
+            break
+
+if __name__ == '__main__':
+    main()
@@ -1,75 +0,0 @@
-# Author: Saaket Agashe
-# Date: 2021-09-15
-# License: MIT
-
-from lmm_agents.MultimodalAgent import LMMAgent
-import json 
-import pandas as pd
-from collections import Counter
-from tqdm import tqdm 
-from paths import LOG_PATH, DATA_PATH
-from datetime import datetime 
-import os 
-# Debate about a question about an image in the hallusion bench dataset 
-def actor_critic(identifier, n_rounds, actor, critic, prompt, image_path):
-    
-    # Check all the required agents are provided
-    if not actor or not critic:
-        raise ValueError("actor-critic requires both actor and critic agents to be provided")
-
-    conversation_history = []
-    initial_aff_agent_response = None 
-    for round in range(n_rounds):
-        # Start the debate 
-        # Only need to pass the image and set the tone of the conversation (disagreement) in the first turn 
-        if round == 0:
-            # The image and question is passed as input to the affirmative agent
-            actor.add_message(prompt + " Think step by step.", image_content=image_path)
-            conversation_history.append(f"Question: {prompt}")
-            response = actor.get_response()
-            conversation_history.append(f"My answer: {response}")
-            actor.add_message(response)
-
-
-            critic_prompt = f"{prompt}. My answer: {response}. Check if my reasoning is correct. think step by step."
-            critic.add_message(critic_prompt, image_content=image_path)
-            critic_response = critic.get_response()
-            conversation_history.append(f"Critic: {critic_response}")
-            critic.add_message(critic_response)
-
-            if 'Verification: Success'.lower() in critic_response.lower() or 'sucess' in critic_response.lower():
-                break
-        else:
-            # The negative agent's response is passed to the affirmative agent
-            actor.add_message(critic_response)
-            response = actor.get_response()
-            conversation_history.append(f"My answer: {response}")
-            actor.add_message(response)
-
-
-            critic_prompt = f"{prompt}. My updated answer: {response}. Check if my reasoning is correct. Think step by step."
-            critic.add_message(critic_prompt, image_content=image_path)
-            critic_response = critic.get_response()
-            conversation_history.append(f"Critic: {critic_response}")
-            critic.add_message(critic_response)
-
-            if 'Verification: Success'.lower() in critic_response.lower() or 'sucess' in critic_response.lower():
-                break
-            
-            
-    # extract everything after Final Answer from moderator_response 
-    final_answer = response 
-
-    # Save conversation history in a json log file 
-    today = datetime.today()
-    year, day, month = today.year, today.day, today.month
-
-    # makedirs with name blah if they don't exist 
-    os.makedirs(f"{LOG_PATH}/{year}_{day}_{month}", exist_ok=True)
-
-    with open(f"{LOG_PATH}/{year}_{day}_{month}/actor_critic_history_{identifier}.json", "w") as f:
-        json.dump(conversation_history, f)
-
-    return final_answer
-        
-    
@@ -1,164 +0,0 @@
-# Author: Saaket Agashe
-# Date: 2021-09-15
-# License: MIT
-
-from lmm_agents.MultimodalAgent import LMMAgent
-import json 
-import pandas as pd
-from collections import Counter
-from tqdm import tqdm 
-from paths import LOG_PATH, DATA_PATH
-from datetime import datetime 
-import os 
-
-
-def uninterrupted_debate(identifier, n_rounds, aff_agent, neg_agent, moderator, prompt, image_path):
-    # Check all the required agents are provided
-    if not aff_agent or not neg_agent or not moderator:
-        raise ValueError("debate requires an affirmative agent, a negative agent and a moderator to be provided")
-
-    engine_params = {
-        "engine_type": "azure",
-        "model": "gui-agents",
-    }
-
-    conversation_history = []
-    initial_aff_agent_response = None 
-    for round in range(n_rounds):
-        # Start the debate 
-        # Only need to pass the image and set the tone of the conversation (disagreement) in the first turn 
-        if round == 0:
-            # The image and question is passed as input to the affirmative agent
-            aff_agent.add_message(prompt + "", image_content=image_path)
-            conversation_history.append(f"Question: {prompt}")
-            aff_response = aff_agent.get_response()
-            conversation_history.append(f"Affirmative Agent: {aff_response}")
-            aff_agent.add_message(aff_response)
-
-            initial_aff_agent_response = aff_response
-
-            # The affirmative agent's response is passed to the negative agent
-            neg_agent_input = f"{prompt}. My response: {aff_response}. You disagree with me. State your reasons and your answer."
-            neg_agent.add_message(neg_agent_input, image_content=image_path)
-            neg_response = neg_agent.get_response()
-            conversation_history.append(f"Negative Agent: {neg_response}")
-            neg_agent.add_message(neg_response)
-        else:
-            # The negative agent's response is passed to the affirmative agent
-            aff_agent.add_message(neg_response + "")
-            aff_response = aff_agent.get_response()
-            conversation_history.append(f"Affirmative Agent: {aff_response}")
-
-            # The affirmative agent's response is passed to the negative agent
-            neg_agent.add_message(f"{aff_response}. You disagree with me. State your reasons and your answer.")
-            neg_response = neg_agent.get_response()
-            conversation_history.append(f"Negative Agent: {neg_response}")
-
-            if round == n_rounds-1:
-                moderator_message = "\n".join(conversation_history).replace("Negative Agent", "Debater 2").replace("Affirmative Agent", "Debater 1")
-                
-                moderator.add_message(f"{moderator_message}. You must decide an answer this round since its the last round of debate.")
-                moderator_response = moderator.get_response()
-                # moderator.add_message(moderator_response)
-                
-                conversation_history.append(f"Moderator: {moderator_response}")
-    # extract everything after Final Answer from moderator_response 
-    if 'Final Answer' in moderator_response:
-        final_answer = moderator_response.split("Final Answer:")[-1]
-        final_answer = final_answer.strip()
-    else:
-        final_answer = moderator_response 
-
-    # Save conversation history in a json log file 
-    today = datetime.today()
-    year, day, month = today.year, today.day, today.month
-
-    # makedirs with name blah if they don't exist 
-    os.makedirs(f"{LOG_PATH}/{year}_{day}_{month}", exist_ok=True)
-
-    with open(f"{LOG_PATH}/{year}_{day}_{month}/conversation_history_{identifier}.json", "w") as f:
-        json.dump(conversation_history, f)
-
-    return final_answer, initial_aff_agent_response
-
-
-# Debate about a question about an image in the hallusion bench dataset 
-def debate(identifier, n_rounds, aff_agent, neg_agent, moderator, prompt, image_path):
-    
-    # Check all the required agents are provided
-    if not aff_agent or not neg_agent or not moderator:
-        raise ValueError("debate requires an affirmative agent, a negative agent and a moderator to be provided")
-
-    engine_params = {
-        "engine_type": "azure",
-        "model": "gui-agents",
-    }
-
-    conversation_history = []
-    initial_aff_agent_response = None 
-    for round in range(n_rounds):
-        # Start the debate 
-        # Only need to pass the image and set the tone of the conversation (disagreement) in the first turn 
-        if round == 0:
-            # The image and question is passed as input to the affirmative agent
-            aff_agent.add_message(prompt + "", image_content=image_path)
-            conversation_history.append(f"Question: {prompt}")
-            aff_response = aff_agent.get_response()
-            conversation_history.append(f"Affirmative Agent: {aff_response}")
-            aff_agent.add_message(aff_response)
-
-            initial_aff_agent_response = aff_response
-
-            # The affirmative agent's response is passed to the negative agent
-            neg_agent_input = f"{prompt}. My response: {aff_response}. You disagree with me. State your reasons and your answer."
-            neg_agent.add_message(neg_agent_input, image_content=image_path)
-            neg_response = neg_agent.get_response()
-            conversation_history.append(f"Negative Agent: {neg_response}")
-            neg_agent.add_message(neg_response)
-
-            moderator.add_message(f"{prompt}. Debater 1 argues: {aff_response}. Debater 2 argues: {neg_response}.", image_content=image_path)
-            moderator_response = moderator.get_response()
-            moderator.add_message(moderator_response)
-            conversation_history.append(f"Moderator: {moderator_response}")
-
-            if 'Final Answer' in moderator_response:
-                break
-        else:
-            # The negative agent's response is passed to the affirmative agent
-            aff_agent.add_message(neg_response + "")
-            aff_response = aff_agent.get_response()
-            conversation_history.append(f"Affirmative Agent: {aff_response}")
-
-            # The affirmative agent's response is passed to the negative agent
-            neg_agent.add_message(f"{aff_response}. You disagree with me. State your reasons and your answer.")
-            neg_response = neg_agent.get_response()
-            conversation_history.append(f"Negative Agent: {neg_response}")
-
-            if round == n_rounds-1:
-                moderator.add_message(f"{prompt}. Debater 1 argues: {aff_response}. Debater 2 argues: {neg_response}. You must decide an answer this round since its the last round of debate.")
-            else:
-                moderator.add_message(f"{prompt}. Debater 1 argues: {aff_response}. Debater 2 argues: {neg_response}.")
-
-            moderator_response = moderator.get_response()
-            moderator.add_message(moderator_response)
-            conversation_history.append(f"Moderator: {moderator_response}")
-
-            if 'Final Answer' in moderator_response:
-                break
-    # extract everything after Final Answer from moderator_response 
-    final_answer = moderator_response.split("Final Answer:")[-1]
-    final_answer = final_answer.strip()
-
-    # Save conversation history in a json log file 
-    today = datetime.today()
-    year, day, month = today.year, today.day, today.month
-
-    # makedirs with name blah if they don't exist 
-    os.makedirs(f"{LOG_PATH}/{year}_{day}_{month}", exist_ok=True)
-
-    with open(f"{LOG_PATH}/{year}_{day}_{month}/conversation_history_{identifier}.json", "w") as f:
-        json.dump(conversation_history, f)
-
-    return final_answer, initial_aff_agent_response
-        
-    
@@ -1,257 +0,0 @@
-# Author: Saaket Agashe
-# Date: 2021-09-15
-# License: MIT
-
-# Standard Imports 
-from collections import Counter 
-import json 
-from tqdm import tqdm
-import os 
-import pandas as pd 
-from datetime import datetime
-from PIL import Image, ImageDraw
-
-# Custom Imports 
-from debate import debate, debate2
-from actor_critic import actor_critic
-from lmm_agents.MultimodalAgent import LMMAgent
-from prompts import DEBATER_SYSTEM_PROMPT, MODERATOR_SYSTEM_PROMPT, ACTOR_PROMPT, CRITIC_PROMPT, MODERATOR_FINAL_PROMPT
-from paths import LOG_PATH, DATA_PATH
-from llava.model.builder import load_pretrained_model
-from llava.mm_utils import (
-    get_model_name_from_path,
-)
-# Experiment settings read by main():
-#   strategy   - which answering workflow to run; main() branches on 'debate' and
-#                'actor_critic' and treats any other value as the single-agent baseline.
-#   n_rounds   - number of rounds passed to the debate strategy.
-#   model_type - selects the engine parameters; main() handles 'llava' and 'cogvlm'
-#                explicitly and falls back to the azure engine for any other value.
-
-
-experiment_config = {
-    'strategy': 'debate', # debate, actor_critic, baseline
-    'n_rounds': 2,
-    'model_type': 'azure'
-}
-
-# `sample_note` values used by main() to pick which dataset rows to evaluate
-# (rows are kept when their sample_note is in this list). Several keys are
-# repeated; the repetition has no effect on that membership filter.
-# NOTE(review): the name suggests these were drawn at random -- confirm the origin.
-random_sels = ['pi',
- 'DC_metro',
- 'phone_sales',
- 'population growth',
- 'simpson',
- 'math_prob',
- 'population',
- 'china_export_us',
- 'teen_population',
- 'para_angle',
- 'teen_population',
- 'math_prob',
- 'world_war2',
- 'sqrt2',
- 'duck',
- 'teen_population',
- 'line',
- 'illusion',
- 'Red Velvet',
- 'math_prob',
- 'usmap',
- 'central_bank',
- 'flow',
- 'line',
- 'circle',
- 'square',
- 'Kennedy',
- 'Berlin',
- 'NBA',
- 'parking']
-
-
-
-def visualize():
-        # Load the hallusionbench data
-    hallusion_path = '/data4/saaket/hallusion_bench/'
-    data_file = os.path.join(hallusion_path, 'HallusionBench.json')
-    with open(data_file, 'r') as f:
-        data = json.load(f)
-
-    df = pd.DataFrame(data)
-
-    #### DATA FILTERING #####
-
-    # What is the size of the df and the distribution of categories and subcategories
-    print(len(df))
-    print(Counter(df['category']))
-    print(Counter(df['subcategory']))
-
-
-    # Let's only keep the visual dependent examples (VD)
-    df_vd = df.loc[df['category'] == 'VD']
-
-    # This code will sample n_samples examples from each subcategory
-    n_samples = 5
-    random_seed = 42  # Set the seed for reproducibility
-
-    df_vd_illusion_only = df_vd[df_vd['subcategory'] == 'illusion']
-    df_vd_illusion_selected_sample_keys = df_vd_illusion_only[df_vd_illusion_only['sample_note'].isin(['circle', 'box', 'line', 'rail', 'grey_dot'])]
-    sample_df = df_vd_illusion_selected_sample_keys.groupby('sample_note', group_keys=False).apply(lambda x: x.sample(min(len(x), n_samples), random_state=random_seed))
-
-
-    #### VISUALIZATION #####
-    answers = []
-    sample_df = sample_df.reset_index()
-    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-    for index, row in tqdm(sample_df.iterrows()):
-        # print(row['question'])
-        # if index == 20:
-        #     break 
-        question = row['question']
-        filename = row['filename']
-        image_path = hallusion_path + filename
-        
-        # Open image, write the question on the bottom of the image and save
-        img = Image.open(image_path)
-        draw = ImageDraw.Draw(img)
-        draw.text((10, 10), question, (255, 255, 255))
-        img.save(f"{LOG_PATH}/{question}_{index}.png")
-
-def main():
-
-    # Load the hallusionbench data
-    hallusion_path = '/data4/saaket/hallusion_bench/'
-    data_file = os.path.join(hallusion_path, 'HallusionBench.json')
-    # data_file = 'got_these_wrong_before_vd.json'
-    with open(data_file, 'r') as f:
-        data = json.load(f)
-
-    df = pd.DataFrame(data)
-
-    #### DATA FILTERING #####
-
-    # What is the size of the df and the distribution of categories and subcategories
-    print(len(df))
-    print(Counter(df['category']))
-    print(Counter(df['subcategory']))
-
-
-    # Let's only keep the visual dependent examples (VD)
-    # df_vd = df.loc[df['category'] == 'VD']
-    # df_vs = df.loc[df['category'] == 'VS']
-    # # This code will sample n_samples examples from each subcategory
-    # n_samples = 5
-    # random_seed = 42  # Set the seed for reproducibility
-
-    # df_vd_illusion_only = df_vd[df_vd['subcategory'] == 'illusion']
-    # print(Counter(df_vd_illusion_only['sample_note']))
-    # df_vd_illusion_selected_sample_keys = df_vd_illusion_only[df_vd_illusion_only['sample_note'].isin(['circle', 'box', 'line', 'rail', 'grey_dot'])]
-    # df_vd_illusion_selected_sample_keys = df_vd_illusion_only[~df_vd_illusion_only['sample_note'].isin(['circle', 'line'])]
-    # # sample_df = df_vd_illusion_selected_sample_keys.groupby('sample_note', group_keys=False).apply(lambda x: x.sample(min(len(x), n_samples), random_state=random_seed))
-    # print(Counter(df_vd_illusion_selected_sample_keys['sample_note']))
-    # sample_df = df_vd_illusion_selected_sample_keys
-    sample_df = df[df['sample_note'].isin(random_sels)]
-    print(sample_df.head())
-    print(len(sample_df))
-    print(Counter(sample_df['category']))
-    #### AGENT INITIALIZATION #####
-    if experiment_config['model_type'] == 'llava':
-        engine_params = {
-            "engine_type": 'llava',
-            'model_path': 'liuhaotian/llava-v1.5-7b',
-        }
-        
-        tokenizer, model, image_processor, context_len = load_pretrained_model(
-                engine_params['model_path'], None, get_model_name_from_path(engine_params['model_path']))
-        engine_params['tokenizer'] = tokenizer 
-        engine_params['model'] = model 
-        engine_params['image_processor'] = image_processor
-        engine_params['context_len'] = context_len
-   
-    elif experiment_config['model_type'] == 'cogvlm':
-        engine_params = {
-            'engine_type': 'cogvlm',
-            'model_path': "THUDM/cogvlm2-llama3-chat-19B"
-        }
-        
-    else:
-        engine_params = {
-            "engine_type": "azure",
-            "model": "guiagents",
-            "api_version": "2023-12-01-preview"
-        }
-
-    #### AGENT WORKFLOW #####
-    answers = []
-    baseline_answers = []
-    sample_df = sample_df.reset_index()
-    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-    
-    
-    
-    for index, row in tqdm(sample_df.iterrows(), total=sample_df.shape[0]):
-        # print(row['question'])
-        # if index == 20:
-        #     break 
-        # if index < 1066:
-        #     continue
-        question = row['question']
-        filename = row['filename']
-        if filename:
-            image_path = hallusion_path + filename
-        else:
-            image_path = None 
-        # Log the values to visualize what is happening
-        print(f"Generating answer for question: {question}")
-        print(f"Using image path: {image_path}")
-        
-        # Generate the answer using the engine
-        
-        if experiment_config['strategy'] == 'debate':
-            identifier = f"{index}_{question[:10]}"
-            aff_agent = LMMAgent(engine_params=engine_params, system_prompt=DEBATER_SYSTEM_PROMPT)
-            neg_agent = LMMAgent(engine_params=engine_params, system_prompt=DEBATER_SYSTEM_PROMPT)
-            moderator = LMMAgent(engine_params=engine_params, system_prompt=MODERATOR_FINAL_PROMPT)
-            answer, baseline_answer = debate2(identifier=identifier,
-                         n_rounds=experiment_config['n_rounds'],
-                         aff_agent=aff_agent, 
-                         neg_agent=neg_agent,
-                         moderator=moderator,
-                         prompt=question,
-                         image_path=image_path)
-        elif experiment_config['strategy'] == 'actor_critic':
-            identifier = f"{index}_{question[:10]}"
-            actor = LMMAgent(engine_params=engine_params, system_prompt=ACTOR_PROMPT)
-            critic = LMMAgent(engine_params=engine_params, system_prompt=CRITIC_PROMPT)
-            answer = actor_critic(identifier=identifier, 
-                                  n_rounds=2,
-                                  actor=actor, 
-                                  critic=critic, 
-                                  prompt=question, 
-                                  image_path=image_path)
-        else:
-            agent = LMMAgent(engine_params=engine_params, system_prompt=DEBATER_SYSTEM_PROMPT)
-            agent.add_message(question, image_content=image_path)
-            answer = agent.get_response()
-        
-        
-        # Log the generated answer
-        print(f"Generated answer: {answer}")
-
-        # Collect the generated answer 
-        answers.append(answer)
-        
-        if experiment_config['strategy'] == 'debate':
-            baseline_answers.append(baseline_answer)
-
-        # Save the answers to a unique file
-
-        # Save conversation history in a json log file 
-        today = datetime.today()
-        year, day, month = today.year, today.day, today.month
-
-        # makedirs with name blah if they don't exist 
-        os.makedirs(f"{LOG_PATH}/{year}_{day}_{month}", exist_ok=True)
-        
-        with open(f'{LOG_PATH}/{year}_{day}_{month}/answers_debate_{timestamp}.json', 'w') as f:
-            json.dump(answers, f)
-
-        with open(f'{LOG_PATH}/{year}_{day}_{month}/baseline_answers_{timestamp}.json', 'w') as f:
-            json.dump(baseline_answers, f)
-
-# Run the experiment only when this file is executed as a script.
-if __name__ == '__main__':
-    main()
-    # visualize()
@@ -1,8 +0,0 @@
-from PIL import Image
-
-# Open an image file
-with Image.open('/data4/saaket/hallusion_bench/VS/map/0_1.png') as img:
-    # Get the size of the image
-    width, height = img.size
-
-print(f'The image size is {width} x {height} pixels.')
@@ -1,201 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Imports \n",
-    "\n",
-    "import json\n",
-    "import os\n",
-    "import random\n",
-    "import time\n",
-    "from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union\n",
-    "\n",
-    "import matplotlib.pyplot as plt\n",
-    "import numpy as np\n",
-    "import requests\n",
-    "from PIL import Image\n",
-    "from termcolor import colored\n",
-    "\n",
-    "import autogen\n",
-    "from autogen import Agent, AssistantAgent, ConversableAgent, UserProxyAgent\n",
-    "from autogen.agentchat.contrib.capabilities.vision_capability import VisionCapability\n",
-    "from autogen.agentchat.contrib.img_utils import get_pil_image, pil_to_data_uri\n",
-    "from autogen.agentchat.contrib.multimodal_conversable_agent import MultimodalConversableAgent\n",
-    "from autogen.code_utils import content_str\n",
-    "from datasets import load_dataset"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "c2a451a74eba4e51921c89af014dda68",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Downloading readme:   0%|          | 0.00/5.20k [00:00<?, ?B/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "65799054c25249de817f1492a2ab073c",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Resolving data files:   0%|          | 0/27 [00:00<?, ?it/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "39d400fa641947ec82773bf8753e91f5",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Downloading data:   0%|          | 0/27 [00:00<?, ?files/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "ename": "KeyboardInterrupt",
-     "evalue": "",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mKeyboardInterrupt\u001b[0m                         Traceback (most recent call last)",
-      "Cell \u001b[0;32mIn[6], line 7\u001b[0m\n\u001b[1;32m      2\u001b[0m os\u001b[38;5;241m.\u001b[39menviron[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mHF_HOME\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m/data4/saaket/cache\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[1;32m      3\u001b[0m os\u001b[38;5;241m.\u001b[39menviron[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mHF_DATASETS_CACHE\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m/data4/saaket/cache\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[0;32m----> 7\u001b[0m dataset \u001b[38;5;241m=\u001b[39m \u001b[43mload_dataset\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mosunlp/Multimodal-Mind2Web\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n",
-      "File \u001b[0;32m~/.conda/envs/llm_exp/lib/python3.9/site-packages/datasets/load.py:2609\u001b[0m, in \u001b[0;36mload_dataset\u001b[0;34m(path, name, data_dir, data_files, split, cache_dir, features, download_config, download_mode, verification_mode, ignore_verifications, keep_in_memory, save_infos, revision, token, use_auth_token, task, streaming, num_proc, storage_options, trust_remote_code, **config_kwargs)\u001b[0m\n\u001b[1;32m   2606\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m builder_instance\u001b[38;5;241m.\u001b[39mas_streaming_dataset(split\u001b[38;5;241m=\u001b[39msplit)\n\u001b[1;32m   2608\u001b[0m \u001b[38;5;66;03m# Download and prepare data\u001b[39;00m\n\u001b[0;32m-> 2609\u001b[0m \u001b[43mbuilder_instance\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdownload_and_prepare\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m   2610\u001b[0m \u001b[43m    \u001b[49m\u001b[43mdownload_config\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdownload_config\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2611\u001b[0m \u001b[43m    \u001b[49m\u001b[43mdownload_mode\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdownload_mode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2612\u001b[0m \u001b[43m    \u001b[49m\u001b[43mverification_mode\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mverification_mode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2613\u001b[0m \u001b[43m    \u001b[49m\u001b[43mnum_proc\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mnum_proc\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2614\u001b[0m \u001b[43m    \u001b[49m\u001b[43mstorage_options\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mstorage_options\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2615\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   2617\u001b[0m \u001b[38;5;66;03m# Build dataset for splits\u001b[39;00m\n\u001b[1;32m   2618\u001b[0m keep_in_memory \u001b[38;5;241m=\u001b[39m (\n\u001b[1;32m   
2619\u001b[0m     keep_in_memory \u001b[38;5;28;01mif\u001b[39;00m keep_in_memory \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m is_small_dataset(builder_instance\u001b[38;5;241m.\u001b[39minfo\u001b[38;5;241m.\u001b[39mdataset_size)\n\u001b[1;32m   2620\u001b[0m )\n",
-      "File \u001b[0;32m~/.conda/envs/llm_exp/lib/python3.9/site-packages/datasets/builder.py:1027\u001b[0m, in \u001b[0;36mDatasetBuilder.download_and_prepare\u001b[0;34m(self, output_dir, download_config, download_mode, verification_mode, ignore_verifications, try_from_hf_gcs, dl_manager, base_path, use_auth_token, file_format, max_shard_size, num_proc, storage_options, **download_and_prepare_kwargs)\u001b[0m\n\u001b[1;32m   1025\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m num_proc \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m   1026\u001b[0m         prepare_split_kwargs[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnum_proc\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m num_proc\n\u001b[0;32m-> 1027\u001b[0m     \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_download_and_prepare\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m   1028\u001b[0m \u001b[43m        \u001b[49m\u001b[43mdl_manager\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdl_manager\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1029\u001b[0m \u001b[43m        \u001b[49m\u001b[43mverification_mode\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mverification_mode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1030\u001b[0m \u001b[43m        \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mprepare_split_kwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1031\u001b[0m \u001b[43m        \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mdownload_and_prepare_kwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1032\u001b[0m \u001b[43m    \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   1033\u001b[0m \u001b[38;5;66;03m# Sync info\u001b[39;00m\n\u001b[1;32m   1034\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39minfo\u001b[38;5;241m.\u001b[39mdataset_size \u001b[38;5;241m=\u001b[39m 
\u001b[38;5;28msum\u001b[39m(split\u001b[38;5;241m.\u001b[39mnum_bytes \u001b[38;5;28;01mfor\u001b[39;00m split \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39minfo\u001b[38;5;241m.\u001b[39msplits\u001b[38;5;241m.\u001b[39mvalues())\n",
-      "File \u001b[0;32m~/.conda/envs/llm_exp/lib/python3.9/site-packages/datasets/builder.py:1100\u001b[0m, in \u001b[0;36mDatasetBuilder._download_and_prepare\u001b[0;34m(self, dl_manager, verification_mode, **prepare_split_kwargs)\u001b[0m\n\u001b[1;32m   1098\u001b[0m split_dict \u001b[38;5;241m=\u001b[39m SplitDict(dataset_name\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdataset_name)\n\u001b[1;32m   1099\u001b[0m split_generators_kwargs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_make_split_generators_kwargs(prepare_split_kwargs)\n\u001b[0;32m-> 1100\u001b[0m split_generators \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_split_generators\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdl_manager\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43msplit_generators_kwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   1102\u001b[0m \u001b[38;5;66;03m# Checksums verification\u001b[39;00m\n\u001b[1;32m   1103\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m verification_mode \u001b[38;5;241m==\u001b[39m VerificationMode\u001b[38;5;241m.\u001b[39mALL_CHECKS \u001b[38;5;129;01mand\u001b[39;00m dl_manager\u001b[38;5;241m.\u001b[39mrecord_checksums:\n",
-      "File \u001b[0;32m~/.conda/envs/llm_exp/lib/python3.9/site-packages/datasets/packaged_modules/parquet/parquet.py:44\u001b[0m, in \u001b[0;36mParquet._split_generators\u001b[0;34m(self, dl_manager)\u001b[0m\n\u001b[1;32m     42\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mAt least one data file must be specified, but got data_files=\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mconfig\u001b[38;5;241m.\u001b[39mdata_files\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m     43\u001b[0m dl_manager\u001b[38;5;241m.\u001b[39mdownload_config\u001b[38;5;241m.\u001b[39mextract_on_the_fly \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[0;32m---> 44\u001b[0m data_files \u001b[38;5;241m=\u001b[39m \u001b[43mdl_manager\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdownload_and_extract\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mconfig\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdata_files\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m     45\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(data_files, (\u001b[38;5;28mstr\u001b[39m, \u001b[38;5;28mlist\u001b[39m, \u001b[38;5;28mtuple\u001b[39m)):\n\u001b[1;32m     46\u001b[0m     files \u001b[38;5;241m=\u001b[39m data_files\n",
-      "File \u001b[0;32m~/.conda/envs/llm_exp/lib/python3.9/site-packages/datasets/download/download_manager.py:434\u001b[0m, in \u001b[0;36mDownloadManager.download_and_extract\u001b[0;34m(self, url_or_urls)\u001b[0m\n\u001b[1;32m    418\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mdownload_and_extract\u001b[39m(\u001b[38;5;28mself\u001b[39m, url_or_urls):\n\u001b[1;32m    419\u001b[0m \u001b[38;5;250m    \u001b[39m\u001b[38;5;124;03m\"\"\"Download and extract given `url_or_urls`.\u001b[39;00m\n\u001b[1;32m    420\u001b[0m \n\u001b[1;32m    421\u001b[0m \u001b[38;5;124;03m    Is roughly equivalent to:\u001b[39;00m\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m    432\u001b[0m \u001b[38;5;124;03m        extracted_path(s): `str`, extracted paths of given URL(s).\u001b[39;00m\n\u001b[1;32m    433\u001b[0m \u001b[38;5;124;03m    \"\"\"\u001b[39;00m\n\u001b[0;32m--> 434\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mextract(\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdownload\u001b[49m\u001b[43m(\u001b[49m\u001b[43murl_or_urls\u001b[49m\u001b[43m)\u001b[49m)\n",
-      "File \u001b[0;32m~/.conda/envs/llm_exp/lib/python3.9/site-packages/datasets/download/download_manager.py:257\u001b[0m, in \u001b[0;36mDownloadManager.download\u001b[0;34m(self, url_or_urls)\u001b[0m\n\u001b[1;32m    255\u001b[0m start_time \u001b[38;5;241m=\u001b[39m datetime\u001b[38;5;241m.\u001b[39mnow()\n\u001b[1;32m    256\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m stack_multiprocessing_download_progress_bars():\n\u001b[0;32m--> 257\u001b[0m     downloaded_path_or_paths \u001b[38;5;241m=\u001b[39m \u001b[43mmap_nested\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m    258\u001b[0m \u001b[43m        \u001b[49m\u001b[43mdownload_func\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    259\u001b[0m \u001b[43m        \u001b[49m\u001b[43murl_or_urls\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    260\u001b[0m \u001b[43m        \u001b[49m\u001b[43mmap_tuple\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m    261\u001b[0m \u001b[43m        \u001b[49m\u001b[43mnum_proc\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdownload_config\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mnum_proc\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    262\u001b[0m \u001b[43m        \u001b[49m\u001b[43mdesc\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mDownloading data files\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m    263\u001b[0m \u001b[43m        \u001b[49m\u001b[43mbatched\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m    264\u001b[0m \u001b[43m        \u001b[49m\u001b[43mbatch_size\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m-\u001b[39;49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m    265\u001b[0m \u001b[43m    \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    266\u001b[0m duration 
\u001b[38;5;241m=\u001b[39m datetime\u001b[38;5;241m.\u001b[39mnow() \u001b[38;5;241m-\u001b[39m start_time\n\u001b[1;32m    267\u001b[0m logger\u001b[38;5;241m.\u001b[39minfo(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mDownloading took \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mduration\u001b[38;5;241m.\u001b[39mtotal_seconds()\u001b[38;5;250m \u001b[39m\u001b[38;5;241m/\u001b[39m\u001b[38;5;241m/\u001b[39m\u001b[38;5;250m \u001b[39m\u001b[38;5;241m60\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m min\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
-      "File \u001b[0;32m~/.conda/envs/llm_exp/lib/python3.9/site-packages/datasets/utils/py_utils.py:491\u001b[0m, in \u001b[0;36mmap_nested\u001b[0;34m(function, data_struct, dict_only, map_list, map_tuple, map_numpy, num_proc, parallel_min_length, batched, batch_size, types, disable_tqdm, desc)\u001b[0m\n\u001b[1;32m    489\u001b[0m     num_proc \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n\u001b[1;32m    490\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28many\u001b[39m(\u001b[38;5;28misinstance\u001b[39m(v, types) \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(v) \u001b[38;5;241m>\u001b[39m \u001b[38;5;28mlen\u001b[39m(iterable) \u001b[38;5;28;01mfor\u001b[39;00m v \u001b[38;5;129;01min\u001b[39;00m iterable):\n\u001b[0;32m--> 491\u001b[0m     mapped \u001b[38;5;241m=\u001b[39m [\n\u001b[1;32m    492\u001b[0m         map_nested(\n\u001b[1;32m    493\u001b[0m             function\u001b[38;5;241m=\u001b[39mfunction,\n\u001b[1;32m    494\u001b[0m             data_struct\u001b[38;5;241m=\u001b[39mobj,\n\u001b[1;32m    495\u001b[0m             num_proc\u001b[38;5;241m=\u001b[39mnum_proc,\n\u001b[1;32m    496\u001b[0m             parallel_min_length\u001b[38;5;241m=\u001b[39mparallel_min_length,\n\u001b[1;32m    497\u001b[0m             batched\u001b[38;5;241m=\u001b[39mbatched,\n\u001b[1;32m    498\u001b[0m             batch_size\u001b[38;5;241m=\u001b[39mbatch_size,\n\u001b[1;32m    499\u001b[0m             types\u001b[38;5;241m=\u001b[39mtypes,\n\u001b[1;32m    500\u001b[0m         )\n\u001b[1;32m    501\u001b[0m         \u001b[38;5;28;01mfor\u001b[39;00m obj \u001b[38;5;129;01min\u001b[39;00m iterable\n\u001b[1;32m    502\u001b[0m     ]\n\u001b[1;32m    503\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m num_proc \u001b[38;5;241m!=\u001b[39m \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m \u001b[38;5;129;01mand\u001b[39;00m num_proc \u001b[38;5;241m<\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m 
\u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(iterable) \u001b[38;5;241m<\u001b[39m parallel_min_length:\n\u001b[1;32m    504\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m batched:\n",
-      "File \u001b[0;32m~/.conda/envs/llm_exp/lib/python3.9/site-packages/datasets/utils/py_utils.py:492\u001b[0m, in \u001b[0;36m<listcomp>\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m    489\u001b[0m     num_proc \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n\u001b[1;32m    490\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28many\u001b[39m(\u001b[38;5;28misinstance\u001b[39m(v, types) \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(v) \u001b[38;5;241m>\u001b[39m \u001b[38;5;28mlen\u001b[39m(iterable) \u001b[38;5;28;01mfor\u001b[39;00m v \u001b[38;5;129;01min\u001b[39;00m iterable):\n\u001b[1;32m    491\u001b[0m     mapped \u001b[38;5;241m=\u001b[39m [\n\u001b[0;32m--> 492\u001b[0m         \u001b[43mmap_nested\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m    493\u001b[0m \u001b[43m            \u001b[49m\u001b[43mfunction\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfunction\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    494\u001b[0m \u001b[43m            \u001b[49m\u001b[43mdata_struct\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mobj\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    495\u001b[0m \u001b[43m            \u001b[49m\u001b[43mnum_proc\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mnum_proc\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    496\u001b[0m \u001b[43m            \u001b[49m\u001b[43mparallel_min_length\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mparallel_min_length\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    497\u001b[0m \u001b[43m            \u001b[49m\u001b[43mbatched\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mbatched\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    498\u001b[0m \u001b[43m            \u001b[49m\u001b[43mbatch_size\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mbatch_size\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    499\u001b[0m \u001b[43m            
\u001b[49m\u001b[43mtypes\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtypes\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    500\u001b[0m \u001b[43m        \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    501\u001b[0m         \u001b[38;5;28;01mfor\u001b[39;00m obj \u001b[38;5;129;01min\u001b[39;00m iterable\n\u001b[1;32m    502\u001b[0m     ]\n\u001b[1;32m    503\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m num_proc \u001b[38;5;241m!=\u001b[39m \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m \u001b[38;5;129;01mand\u001b[39;00m num_proc \u001b[38;5;241m<\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(iterable) \u001b[38;5;241m<\u001b[39m parallel_min_length:\n\u001b[1;32m    504\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m batched:\n",
-      "File \u001b[0;32m~/.conda/envs/llm_exp/lib/python3.9/site-packages/datasets/utils/py_utils.py:508\u001b[0m, in \u001b[0;36mmap_nested\u001b[0;34m(function, data_struct, dict_only, map_list, map_tuple, map_numpy, num_proc, parallel_min_length, batched, batch_size, types, disable_tqdm, desc)\u001b[0m\n\u001b[1;32m    506\u001b[0m         batch_size \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mmax\u001b[39m(\u001b[38;5;28mlen\u001b[39m(iterable) \u001b[38;5;241m/\u001b[39m\u001b[38;5;241m/\u001b[39m num_proc \u001b[38;5;241m+\u001b[39m \u001b[38;5;28mint\u001b[39m(\u001b[38;5;28mlen\u001b[39m(iterable) \u001b[38;5;241m%\u001b[39m num_proc \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m0\u001b[39m), \u001b[38;5;241m1\u001b[39m)\n\u001b[1;32m    507\u001b[0m     iterable \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlist\u001b[39m(iter_batched(iterable, batch_size))\n\u001b[0;32m--> 508\u001b[0m mapped \u001b[38;5;241m=\u001b[39m [\n\u001b[1;32m    509\u001b[0m     _single_map_nested((function, obj, batched, batch_size, types, \u001b[38;5;28;01mNone\u001b[39;00m, \u001b[38;5;28;01mTrue\u001b[39;00m, \u001b[38;5;28;01mNone\u001b[39;00m))\n\u001b[1;32m    510\u001b[0m     \u001b[38;5;28;01mfor\u001b[39;00m obj \u001b[38;5;129;01min\u001b[39;00m hf_tqdm(iterable, disable\u001b[38;5;241m=\u001b[39mdisable_tqdm, desc\u001b[38;5;241m=\u001b[39mdesc)\n\u001b[1;32m    511\u001b[0m ]\n\u001b[1;32m    512\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m batched:\n\u001b[1;32m    513\u001b[0m     mapped \u001b[38;5;241m=\u001b[39m [mapped_item \u001b[38;5;28;01mfor\u001b[39;00m mapped_batch \u001b[38;5;129;01min\u001b[39;00m mapped \u001b[38;5;28;01mfor\u001b[39;00m mapped_item \u001b[38;5;129;01min\u001b[39;00m mapped_batch]\n",
-      "File \u001b[0;32m~/.conda/envs/llm_exp/lib/python3.9/site-packages/datasets/utils/py_utils.py:509\u001b[0m, in \u001b[0;36m<listcomp>\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m    506\u001b[0m         batch_size \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mmax\u001b[39m(\u001b[38;5;28mlen\u001b[39m(iterable) \u001b[38;5;241m/\u001b[39m\u001b[38;5;241m/\u001b[39m num_proc \u001b[38;5;241m+\u001b[39m \u001b[38;5;28mint\u001b[39m(\u001b[38;5;28mlen\u001b[39m(iterable) \u001b[38;5;241m%\u001b[39m num_proc \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m0\u001b[39m), \u001b[38;5;241m1\u001b[39m)\n\u001b[1;32m    507\u001b[0m     iterable \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlist\u001b[39m(iter_batched(iterable, batch_size))\n\u001b[1;32m    508\u001b[0m mapped \u001b[38;5;241m=\u001b[39m [\n\u001b[0;32m--> 509\u001b[0m     \u001b[43m_single_map_nested\u001b[49m\u001b[43m(\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfunction\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mobj\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mbatched\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mbatch_size\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtypes\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    510\u001b[0m     \u001b[38;5;28;01mfor\u001b[39;00m obj \u001b[38;5;129;01min\u001b[39;00m hf_tqdm(iterable, disable\u001b[38;5;241m=\u001b[39mdisable_tqdm, desc\u001b[38;5;241m=\u001b[39mdesc)\n\u001b[1;32m    511\u001b[0m ]\n\u001b[1;32m    512\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m batched:\n\u001b[1;32m    513\u001b[0m     mapped \u001b[38;5;241m=\u001b[39m [mapped_item \u001b[38;5;28;01mfor\u001b[39;00m mapped_batch \u001b[38;5;129;01min\u001b[39;00m mapped 
\u001b[38;5;28;01mfor\u001b[39;00m mapped_item \u001b[38;5;129;01min\u001b[39;00m mapped_batch]\n",
-      "File \u001b[0;32m~/.conda/envs/llm_exp/lib/python3.9/site-packages/datasets/utils/py_utils.py:377\u001b[0m, in \u001b[0;36m_single_map_nested\u001b[0;34m(args)\u001b[0m\n\u001b[1;32m    370\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m function(data_struct)\n\u001b[1;32m    371\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m (\n\u001b[1;32m    372\u001b[0m     batched\n\u001b[1;32m    373\u001b[0m     \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(data_struct, \u001b[38;5;28mdict\u001b[39m)\n\u001b[1;32m    374\u001b[0m     \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(data_struct, types)\n\u001b[1;32m    375\u001b[0m     \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mall\u001b[39m(\u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(v, types) \u001b[38;5;28;01mfor\u001b[39;00m v \u001b[38;5;129;01min\u001b[39;00m data_struct)\n\u001b[1;32m    376\u001b[0m ):\n\u001b[0;32m--> 377\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m [mapped_item \u001b[38;5;28;01mfor\u001b[39;00m batch \u001b[38;5;129;01min\u001b[39;00m iter_batched(data_struct, batch_size) \u001b[38;5;28;01mfor\u001b[39;00m mapped_item \u001b[38;5;129;01min\u001b[39;00m function(batch)]\n\u001b[1;32m    379\u001b[0m \u001b[38;5;66;03m# Reduce logging to keep things readable in multiprocessing with tqdm\u001b[39;00m\n\u001b[1;32m    380\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m rank \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m logging\u001b[38;5;241m.\u001b[39mget_verbosity() \u001b[38;5;241m<\u001b[39m logging\u001b[38;5;241m.\u001b[39mWARNING:\n",
-      "File \u001b[0;32m~/.conda/envs/llm_exp/lib/python3.9/site-packages/datasets/utils/py_utils.py:377\u001b[0m, in \u001b[0;36m<listcomp>\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m    370\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m function(data_struct)\n\u001b[1;32m    371\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m (\n\u001b[1;32m    372\u001b[0m     batched\n\u001b[1;32m    373\u001b[0m     \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(data_struct, \u001b[38;5;28mdict\u001b[39m)\n\u001b[1;32m    374\u001b[0m     \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(data_struct, types)\n\u001b[1;32m    375\u001b[0m     \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mall\u001b[39m(\u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(v, types) \u001b[38;5;28;01mfor\u001b[39;00m v \u001b[38;5;129;01min\u001b[39;00m data_struct)\n\u001b[1;32m    376\u001b[0m ):\n\u001b[0;32m--> 377\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m [mapped_item \u001b[38;5;28;01mfor\u001b[39;00m batch \u001b[38;5;129;01min\u001b[39;00m iter_batched(data_struct, batch_size) \u001b[38;5;28;01mfor\u001b[39;00m mapped_item \u001b[38;5;129;01min\u001b[39;00m \u001b[43mfunction\u001b[49m\u001b[43m(\u001b[49m\u001b[43mbatch\u001b[49m\u001b[43m)\u001b[49m]\n\u001b[1;32m    379\u001b[0m \u001b[38;5;66;03m# Reduce logging to keep things readable in multiprocessing with tqdm\u001b[39;00m\n\u001b[1;32m    380\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m rank \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m logging\u001b[38;5;241m.\u001b[39mget_verbosity() \u001b[38;5;241m<\u001b[39m logging\u001b[38;5;241m.\u001b[39mWARNING:\n",
-      "File \u001b[0;32m~/.conda/envs/llm_exp/lib/python3.9/site-packages/datasets/download/download_manager.py:300\u001b[0m, in \u001b[0;36mDownloadManager._download_batched\u001b[0;34m(self, url_or_filenames, download_config)\u001b[0m\n\u001b[1;32m    295\u001b[0m         \u001b[38;5;28;01mpass\u001b[39;00m\n\u001b[1;32m    296\u001b[0m     max_workers \u001b[38;5;241m=\u001b[39m (\n\u001b[1;32m    297\u001b[0m         config\u001b[38;5;241m.\u001b[39mHF_DATASETS_MULTITHREADING_MAX_WORKERS \u001b[38;5;28;01mif\u001b[39;00m size \u001b[38;5;241m<\u001b[39m (\u001b[38;5;241m20\u001b[39m \u001b[38;5;241m<<\u001b[39m \u001b[38;5;241m20\u001b[39m) \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;241m1\u001b[39m\n\u001b[1;32m    298\u001b[0m     )  \u001b[38;5;66;03m# enable multithreading if files are small\u001b[39;00m\n\u001b[0;32m--> 300\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mthread_map\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m    301\u001b[0m \u001b[43m        \u001b[49m\u001b[43mdownload_func\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    302\u001b[0m \u001b[43m        \u001b[49m\u001b[43murl_or_filenames\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    303\u001b[0m \u001b[43m        \u001b[49m\u001b[43mdesc\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdownload_config\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdownload_desc\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01mor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mDownloading\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m    304\u001b[0m \u001b[43m        \u001b[49m\u001b[43munit\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mfiles\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m    305\u001b[0m \u001b[43m        
\u001b[49m\u001b[43mposition\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmultiprocessing\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcurrent_process\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_identity\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;241;43m-\u001b[39;49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m  \u001b[49m\u001b[38;5;66;43;03m# contains the ranks of subprocesses\u001b[39;49;00m\n\u001b[1;32m    306\u001b[0m \u001b[43m        \u001b[49m\u001b[38;5;28;43;01mif\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mos\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43menviron\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mHF_DATASETS_STACK_MULTIPROCESSING_DOWNLOAD_PROGRESS_BARS\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m==\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m1\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\n\u001b[1;32m    307\u001b[0m \u001b[43m        \u001b[49m\u001b[38;5;129;43;01mand\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mmultiprocessing\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcurrent_process\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_identity\u001b[49m\n\u001b[1;32m    308\u001b[0m \u001b[43m        \u001b[49m\u001b[38;5;28;43;01melse\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m    309\u001b[0m \u001b[43m        \u001b[49m\u001b[43mmax_workers\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmax_workers\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    310\u001b[0m \u001b[43m        \u001b[49m\u001b[43mtqdm_class\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtqdm\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    311\u001b[0m 
\u001b[43m    \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    312\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m    313\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m [\n\u001b[1;32m    314\u001b[0m         \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_download_single(url_or_filename, download_config\u001b[38;5;241m=\u001b[39mdownload_config)\n\u001b[1;32m    315\u001b[0m         \u001b[38;5;28;01mfor\u001b[39;00m url_or_filename \u001b[38;5;129;01min\u001b[39;00m url_or_filenames\n\u001b[1;32m    316\u001b[0m     ]\n",
-      "File \u001b[0;32m~/.conda/envs/llm_exp/lib/python3.9/site-packages/tqdm/contrib/concurrent.py:69\u001b[0m, in \u001b[0;36mthread_map\u001b[0;34m(fn, *iterables, **tqdm_kwargs)\u001b[0m\n\u001b[1;32m     55\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m     56\u001b[0m \u001b[38;5;124;03mEquivalent of `list(map(fn, *iterables))`\u001b[39;00m\n\u001b[1;32m     57\u001b[0m \u001b[38;5;124;03mdriven by `concurrent.futures.ThreadPoolExecutor`.\u001b[39;00m\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m     66\u001b[0m \u001b[38;5;124;03m    [default: max(32, cpu_count() + 4)].\u001b[39;00m\n\u001b[1;32m     67\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m     68\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mconcurrent\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mfutures\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m ThreadPoolExecutor\n\u001b[0;32m---> 69\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m_executor_map\u001b[49m\u001b[43m(\u001b[49m\u001b[43mThreadPoolExecutor\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfn\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43miterables\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mtqdm_kwargs\u001b[49m\u001b[43m)\u001b[49m\n",
-      "File \u001b[0;32m~/.conda/envs/llm_exp/lib/python3.9/site-packages/tqdm/contrib/concurrent.py:51\u001b[0m, in \u001b[0;36m_executor_map\u001b[0;34m(PoolExecutor, fn, *iterables, **tqdm_kwargs)\u001b[0m\n\u001b[1;32m     47\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m ensure_lock(tqdm_class, lock_name\u001b[38;5;241m=\u001b[39mlock_name) \u001b[38;5;28;01mas\u001b[39;00m lk:\n\u001b[1;32m     48\u001b[0m     \u001b[38;5;66;03m# share lock in case workers are already using `tqdm`\u001b[39;00m\n\u001b[1;32m     49\u001b[0m     \u001b[38;5;28;01mwith\u001b[39;00m PoolExecutor(max_workers\u001b[38;5;241m=\u001b[39mmax_workers, initializer\u001b[38;5;241m=\u001b[39mtqdm_class\u001b[38;5;241m.\u001b[39mset_lock,\n\u001b[1;32m     50\u001b[0m                       initargs\u001b[38;5;241m=\u001b[39m(lk,)) \u001b[38;5;28;01mas\u001b[39;00m ex:\n\u001b[0;32m---> 51\u001b[0m         \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mlist\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mtqdm_class\u001b[49m\u001b[43m(\u001b[49m\u001b[43mex\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmap\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfn\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43miterables\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mchunksize\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mchunksize\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n",
-      "File \u001b[0;32m~/.conda/envs/llm_exp/lib/python3.9/site-packages/tqdm/notebook.py:254\u001b[0m, in \u001b[0;36mtqdm_notebook.__iter__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m    252\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m    253\u001b[0m     it \u001b[38;5;241m=\u001b[39m \u001b[38;5;28msuper\u001b[39m(tqdm_notebook, \u001b[38;5;28mself\u001b[39m)\u001b[38;5;241m.\u001b[39m\u001b[38;5;21m__iter__\u001b[39m()\n\u001b[0;32m--> 254\u001b[0m     \u001b[38;5;28;01mfor\u001b[39;00m obj \u001b[38;5;129;01min\u001b[39;00m it:\n\u001b[1;32m    255\u001b[0m         \u001b[38;5;66;03m# return super(tqdm...) will not catch exception\u001b[39;00m\n\u001b[1;32m    256\u001b[0m         \u001b[38;5;28;01myield\u001b[39;00m obj\n\u001b[1;32m    257\u001b[0m \u001b[38;5;66;03m# NB: except ... [ as ...] breaks IPython async KeyboardInterrupt\u001b[39;00m\n",
-      "File \u001b[0;32m~/.conda/envs/llm_exp/lib/python3.9/site-packages/tqdm/std.py:1178\u001b[0m, in \u001b[0;36mtqdm.__iter__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m   1175\u001b[0m time \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_time\n\u001b[1;32m   1177\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m-> 1178\u001b[0m     \u001b[38;5;28;01mfor\u001b[39;00m obj \u001b[38;5;129;01min\u001b[39;00m iterable:\n\u001b[1;32m   1179\u001b[0m         \u001b[38;5;28;01myield\u001b[39;00m obj\n\u001b[1;32m   1180\u001b[0m         \u001b[38;5;66;03m# Update and possibly print the progressbar.\u001b[39;00m\n\u001b[1;32m   1181\u001b[0m         \u001b[38;5;66;03m# Note: does not call self.update(1) for speed optimisation.\u001b[39;00m\n",
-      "File \u001b[0;32m~/.conda/envs/llm_exp/lib/python3.9/concurrent/futures/_base.py:609\u001b[0m, in \u001b[0;36mExecutor.map.<locals>.result_iterator\u001b[0;34m()\u001b[0m\n\u001b[1;32m    606\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m fs:\n\u001b[1;32m    607\u001b[0m     \u001b[38;5;66;03m# Careful not to keep a reference to the popped future\u001b[39;00m\n\u001b[1;32m    608\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m timeout \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m--> 609\u001b[0m         \u001b[38;5;28;01myield\u001b[39;00m \u001b[43mfs\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpop\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mresult\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    610\u001b[0m     \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m    611\u001b[0m         \u001b[38;5;28;01myield\u001b[39;00m fs\u001b[38;5;241m.\u001b[39mpop()\u001b[38;5;241m.\u001b[39mresult(end_time \u001b[38;5;241m-\u001b[39m time\u001b[38;5;241m.\u001b[39mmonotonic())\n",
-      "File \u001b[0;32m~/.conda/envs/llm_exp/lib/python3.9/concurrent/futures/_base.py:441\u001b[0m, in \u001b[0;36mFuture.result\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m    438\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_state \u001b[38;5;241m==\u001b[39m FINISHED:\n\u001b[1;32m    439\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m__get_result()\n\u001b[0;32m--> 441\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_condition\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mwait\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtimeout\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    443\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_state \u001b[38;5;129;01min\u001b[39;00m [CANCELLED, CANCELLED_AND_NOTIFIED]:\n\u001b[1;32m    444\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m CancelledError()\n",
-      "File \u001b[0;32m~/.conda/envs/llm_exp/lib/python3.9/threading.py:312\u001b[0m, in \u001b[0;36mCondition.wait\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m    310\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:    \u001b[38;5;66;03m# restore state no matter what (e.g., KeyboardInterrupt)\u001b[39;00m\n\u001b[1;32m    311\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m timeout \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m--> 312\u001b[0m         \u001b[43mwaiter\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43macquire\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    313\u001b[0m         gotit \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[1;32m    314\u001b[0m     \u001b[38;5;28;01melse\u001b[39;00m:\n",
-      "\u001b[0;31mKeyboardInterrupt\u001b[0m: "
-     ]
-    }
-   ],
-   "source": [
-    "os.environ['TRANSFORMERS_CACHE'] = '/data4/saaket/cache'\n",
-    "os.environ['HF_HOME'] = '/data4/saaket/cache'\n",
-    "os.environ['HF_DATASETS_CACHE'] = '/data4/saaket/cache'\n",
-    "\n",
-    "\n",
-    "\n",
-    "dataset = load_dataset(\"osunlp/Multimodal-Mind2Web\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "lmm_config = {\n",
-    "    \"config_list\": [\n",
-    "        {\n",
-    "            \"model\": \"guiagents\",\n",
-    "            \"api_key\": os.environ.get(\"AZURE_OPENAI_API_KEY\"),\n",
-    "            \"api_type\": \"azure\",\n",
-    "            \"base_url\": os.environ.get(\"AZURE_OPENAI_API_BASE\"),\n",
-    "            \"api_version\": \"2023-12-01-preview\",\n",
-    "        },\n",
-    "    ],\n",
-    "    \"temperature\": 0.5,\n",
-    "    \"timeout\": 300,\n",
-    "}\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "image_agent = MultimodalConversableAgent(\n",
-    "    name=\"image-explainer\",\n",
-    "    max_consecutive_auto_reply=10,\n",
-    "    llm_config=lmm_config,\n",
-    ")\n",
-    "\n",
-    "user_proxy = autogen.UserProxyAgent(\n",
-    "    name=\"User_proxy\",\n",
-    "    system_message=\"A human admin.\",\n",
-    "    human_input_mode=\"NEVER\",  # Try between ALWAYS or NEVER\n",
-    "    max_consecutive_auto_reply=0,\n",
-    "    code_execution_config={\n",
-    "        \"use_docker\": False\n",
-    "    },  # Please set use_docker=True if docker is available to run the generated code. Using docker is safer than running the generated code directly.\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Ask the question with an image\n",
-    "user_proxy.initiate_chat(\n",
-    "    image_agent,\n",
-    "    message=\"\"\"What's the breed of this dog?\n",
-    "<img https://th.bing.com/th/id/R.422068ce8af4e15b0634fe2540adea7a?rik=y4OcXBE%2fqutDOw&pid=ImgRaw&r=0>.\"\"\",\n",
-    ")"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "llm_exp",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.9.16"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}
@@ -1,998 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# p0 = '/home/saaket/lmm-agents/logs/2024_4_6/answers_debate_20240604_152825.json'\n",
-    "# p1 = '/home/saaket/lmm-agents/logs/2024_4_6/answers_debate_20240604_195043.json'\n",
-    "# p2 = '/home/saaket/lmm-agents/logs/2024_5_6/answers_debate_20240604_195043.json'\n",
-    "# p3 = '/home/saaket/lmm-agents/logs/2024_5_6/answers_debate_20240605_113515.json'\n",
-    "s0 = '/home/saaket/lmm-agents/logs/2024_4_6/answers_debate_20240604_152825.json'\n",
-    "s1 = '/home/saaket/lmm-agents/logs/2024_5_6/answers_debate_20240604_195043.json'\n",
-    "s2 = '/home/saaket/lmm-agents/logs/2024_5_6/answers_debate_20240605_113515.json'\n",
-    "\n",
-    "b0 = '/home/saaket/lmm-agents/logs/2024_4_6/baseline_answers_20240604_152825.json'\n",
-    "b1 = '/home/saaket/lmm-agents/logs/2024_5_6/baseline_answers_20240604_195043.json'\n",
-    "b2 = '/home/saaket/lmm-agents/logs/2024_5_6/baseline_answers_20240605_113515.json'"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import json \n",
-    "def open_log_file(file):\n",
-    "    with open(file, 'r') as f:\n",
-    "        data = json.load(f)\n",
-    "    return data"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "len(open_log_file(s0)) + len(open_log_file(s1)) + len(open_log_file(s2))\n",
-    "\n",
-    "debate_answers = open_log_file(s0) + open_log_file(s1) + open_log_file(s2)\n",
-    "baseline_answers = open_log_file(b0) + open_log_file(b1) + open_log_file(b2)\n",
-    "\n",
-    "assert len(debate_answers) == len(baseline_answers)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 16,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import pandas as pd \n",
-    "import os \n",
-    "def compile_in_hallusion_format():\n",
-    "    hallusion_path = '/data4/saaket/hallusion_bench/'\n",
-    "    data_file = os.path.join(hallusion_path, 'HallusionBench.json')\n",
-    "    with open(data_file, 'r') as f:\n",
-    "        data = json.load(f)\n",
-    "    df = pd.DataFrame(data)\n",
-    "\n",
-    "    # drop entries from 1055 to 1065 (both inclusive) \n",
-    "    df = df.drop(df.index[1054:1065])\n",
-    "    print(len(df))\n",
-    "    # check if length of df is equal to length of debate_answers\n",
-    "    debate_answers_df = df.copy()\n",
-    "    baseline_answers_df = df.copy()\n",
-    "\n",
-    "    debate_answers_df['model_prediction'] = debate_answers\n",
-    "    baseline_answers_df['model_prediction'] = baseline_answers\n",
-    "\n",
-    "    debate_answers_df = debate_answers_df.drop(debate_answers_df.index[1053:1054])\n",
-    "    baseline_answers_df = baseline_answers_df.drop(baseline_answers_df.index[1053:1054])\n",
-    "\n",
-    "    # make dirs\n",
-    "    os.makedirs('output/gpt4v/debate_baseline', exist_ok=True)\n",
-    "    os.makedirs('output/gpt4v/debate', exist_ok=True)\n",
-    "\n",
-    "    debate_answers_df.to_json('output/gpt4v/debate/HallusionBench_result.json', orient='records')\n",
-    "    baseline_answers_df.to_json('output/gpt4v/debate_baseline/HallusionBench_result.json', orient='records')"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 17,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "1118\n"
-     ]
-    }
-   ],
-   "source": [
-    "compile_in_hallusion_format()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 14,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "all_answers = open_log_file(p0) + open_log_file(p1) + open_log_file(p2) + open_log_file(p3)\n",
-    "\n",
-    "# drop duplicates without destroying order\n",
-    "seen = set()\n",
-    "answers = [] \n",
-    "for answer in all_answers:\n",
-    "    if answer not in seen:\n",
-    "        answers.append(answer)\n",
-    "        seen.add(answer)\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 16,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "933"
-      ]
-     },
-     "execution_count": 16,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "len(answers)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import pandas as pd \n",
-    "\n",
-    "df = pd.read_json('/data4/saaket/hallusion_bench/HallusionBench.json')"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 10,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "Counter({0: 23, 1: 23, 2: 9})"
-      ]
-     },
-     "execution_count": 10,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "from collections import Counter \n",
-    "Counter(df[df['set_id'] == 0]['figure_id'])"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 14,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "unique_figure_columns = str(df['set_id']) + \"_\" + str(df['figure_id'])"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 23,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "df['unique_figure_columns'] = df['set_id'].astype(str) + \"_\" + df['figure_id'].astype(str) + \"_\" + df['sample_note'].astype(str)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 16,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "df['unique_figure_columns'] = unique_figure_columns "
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 24,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>category</th>\n",
-       "      <th>subcategory</th>\n",
-       "      <th>visual_input</th>\n",
-       "      <th>set_id</th>\n",
-       "      <th>figure_id</th>\n",
-       "      <th>sample_note</th>\n",
-       "      <th>question_id</th>\n",
-       "      <th>question</th>\n",
-       "      <th>gt_answer_details</th>\n",
-       "      <th>gt_answer</th>\n",
-       "      <th>filename</th>\n",
-       "      <th>unique_figure_columns</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>VS</td>\n",
-       "      <td>chart</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>import</td>\n",
-       "      <td>0</td>\n",
-       "      <td>Is China, Hongkong SAR, the leading importing ...</td>\n",
-       "      <td>Switzerland is the leading importing country o...</td>\n",
-       "      <td>0</td>\n",
-       "      <td>None</td>\n",
-       "      <td>0_0_import</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>VS</td>\n",
-       "      <td>chart</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>import</td>\n",
-       "      <td>1</td>\n",
-       "      <td>Is Switzerland, the leading importing country ...</td>\n",
-       "      <td>Switzerland is the leading importing country o...</td>\n",
-       "      <td>1</td>\n",
-       "      <td>None</td>\n",
-       "      <td>0_0_import</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>VS</td>\n",
-       "      <td>chart</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>import</td>\n",
-       "      <td>2</td>\n",
-       "      <td>Is France, the leading importing country of go...</td>\n",
-       "      <td>Switzerland is the leading importing country o...</td>\n",
-       "      <td>0</td>\n",
-       "      <td>None</td>\n",
-       "      <td>0_0_import</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>VS</td>\n",
-       "      <td>chart</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>import</td>\n",
-       "      <td>3</td>\n",
-       "      <td>Is the United States, the leading importing co...</td>\n",
-       "      <td>Switzerland is the leading importing country o...</td>\n",
-       "      <td>0</td>\n",
-       "      <td>None</td>\n",
-       "      <td>0_0_import</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>VS</td>\n",
-       "      <td>chart</td>\n",
-       "      <td>1</td>\n",
-       "      <td>0</td>\n",
-       "      <td>1</td>\n",
-       "      <td>import</td>\n",
-       "      <td>0</td>\n",
-       "      <td>Is China, Hongkong SAR, the leading importing ...</td>\n",
-       "      <td>Switzerland is the leading importing country o...</td>\n",
-       "      <td>0</td>\n",
-       "      <td>./VS/chart/0_1.png</td>\n",
-       "      <td>0_1_import</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>...</th>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1124</th>\n",
-       "      <td>VD</td>\n",
-       "      <td>video</td>\n",
-       "      <td>2</td>\n",
-       "      <td>19</td>\n",
-       "      <td>1</td>\n",
-       "      <td>running</td>\n",
-       "      <td>3</td>\n",
-       "      <td>He is running clockwise. According to the posi...</td>\n",
-       "      <td>The man is running clockwise.</td>\n",
-       "      <td>1</td>\n",
-       "      <td>./VD/video/19_1.png</td>\n",
-       "      <td>19_1_running</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1125</th>\n",
-       "      <td>VD</td>\n",
-       "      <td>video</td>\n",
-       "      <td>2</td>\n",
-       "      <td>19</td>\n",
-       "      <td>2</td>\n",
-       "      <td>running</td>\n",
-       "      <td>0</td>\n",
-       "      <td>According to the positive sequence of the imag...</td>\n",
-       "      <td>The man is running counterclockwise.</td>\n",
-       "      <td>1</td>\n",
-       "      <td>./VD/video/19_2.png</td>\n",
-       "      <td>19_2_running</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1126</th>\n",
-       "      <td>VD</td>\n",
-       "      <td>video</td>\n",
-       "      <td>2</td>\n",
-       "      <td>19</td>\n",
-       "      <td>2</td>\n",
-       "      <td>running</td>\n",
-       "      <td>1</td>\n",
-       "      <td>According to the positive sequence of the imag...</td>\n",
-       "      <td>The man is running clockwise.</td>\n",
-       "      <td>1</td>\n",
-       "      <td>./VD/video/19_2.png</td>\n",
-       "      <td>19_2_running</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1127</th>\n",
-       "      <td>VD</td>\n",
-       "      <td>video</td>\n",
-       "      <td>2</td>\n",
-       "      <td>19</td>\n",
-       "      <td>2</td>\n",
-       "      <td>running</td>\n",
-       "      <td>2</td>\n",
-       "      <td>He is running counterclockwise. According to t...</td>\n",
-       "      <td>The images are not in correct order</td>\n",
-       "      <td>0</td>\n",
-       "      <td>./VD/video/19_2.png</td>\n",
-       "      <td>19_2_running</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1128</th>\n",
-       "      <td>VD</td>\n",
-       "      <td>video</td>\n",
-       "      <td>2</td>\n",
-       "      <td>19</td>\n",
-       "      <td>2</td>\n",
-       "      <td>running</td>\n",
-       "      <td>3</td>\n",
-       "      <td>He is running clockwise. According to the posi...</td>\n",
-       "      <td>The images are not in correct order</td>\n",
-       "      <td>0</td>\n",
-       "      <td>./VD/video/19_2.png</td>\n",
-       "      <td>19_2_running</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "<p>1129 rows × 12 columns</p>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "     category subcategory  visual_input  set_id  figure_id sample_note   \n",
-       "0          VS       chart             0       0          0      import  \\\n",
-       "1          VS       chart             0       0          0      import   \n",
-       "2          VS       chart             0       0          0      import   \n",
-       "3          VS       chart             0       0          0      import   \n",
-       "4          VS       chart             1       0          1      import   \n",
-       "...       ...         ...           ...     ...        ...         ...   \n",
-       "1124       VD       video             2      19          1     running   \n",
-       "1125       VD       video             2      19          2     running   \n",
-       "1126       VD       video             2      19          2     running   \n",
-       "1127       VD       video             2      19          2     running   \n",
-       "1128       VD       video             2      19          2     running   \n",
-       "\n",
-       "      question_id                                           question   \n",
-       "0               0  Is China, Hongkong SAR, the leading importing ...  \\\n",
-       "1               1  Is Switzerland, the leading importing country ...   \n",
-       "2               2  Is France, the leading importing country of go...   \n",
-       "3               3  Is the United States, the leading importing co...   \n",
-       "4               0  Is China, Hongkong SAR, the leading importing ...   \n",
-       "...           ...                                                ...   \n",
-       "1124            3  He is running clockwise. According to the posi...   \n",
-       "1125            0  According to the positive sequence of the imag...   \n",
-       "1126            1  According to the positive sequence of the imag...   \n",
-       "1127            2  He is running counterclockwise. According to t...   \n",
-       "1128            3  He is running clockwise. According to the posi...   \n",
-       "\n",
-       "                                      gt_answer_details  gt_answer   \n",
-       "0     Switzerland is the leading importing country o...          0  \\\n",
-       "1     Switzerland is the leading importing country o...          1   \n",
-       "2     Switzerland is the leading importing country o...          0   \n",
-       "3     Switzerland is the leading importing country o...          0   \n",
-       "4     Switzerland is the leading importing country o...          0   \n",
-       "...                                                 ...        ...   \n",
-       "1124                      The man is running clockwise.          1   \n",
-       "1125               The man is running counterclockwise.          1   \n",
-       "1126                      The man is running clockwise.          1   \n",
-       "1127                The images are not in correct order          0   \n",
-       "1128                The images are not in correct order          0   \n",
-       "\n",
-       "                 filename unique_figure_columns  \n",
-       "0                    None            0_0_import  \n",
-       "1                    None            0_0_import  \n",
-       "2                    None            0_0_import  \n",
-       "3                    None            0_0_import  \n",
-       "4      ./VS/chart/0_1.png            0_1_import  \n",
-       "...                   ...                   ...  \n",
-       "1124  ./VD/video/19_1.png          19_1_running  \n",
-       "1125  ./VD/video/19_2.png          19_2_running  \n",
-       "1126  ./VD/video/19_2.png          19_2_running  \n",
-       "1127  ./VD/video/19_2.png          19_2_running  \n",
-       "1128  ./VD/video/19_2.png          19_2_running  \n",
-       "\n",
-       "[1129 rows x 12 columns]"
-      ]
-     },
-     "execution_count": 24,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "df"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 25,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>category</th>\n",
-       "      <th>subcategory</th>\n",
-       "      <th>visual_input</th>\n",
-       "      <th>set_id</th>\n",
-       "      <th>figure_id</th>\n",
-       "      <th>sample_note</th>\n",
-       "      <th>question_id</th>\n",
-       "      <th>question</th>\n",
-       "      <th>gt_answer_details</th>\n",
-       "      <th>gt_answer</th>\n",
-       "      <th>filename</th>\n",
-       "      <th>unique_figure_columns</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>VS</td>\n",
-       "      <td>chart</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>import</td>\n",
-       "      <td>0</td>\n",
-       "      <td>Is China, Hongkong SAR, the leading importing ...</td>\n",
-       "      <td>Switzerland is the leading importing country o...</td>\n",
-       "      <td>0</td>\n",
-       "      <td>None</td>\n",
-       "      <td>0_0_import</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>VS</td>\n",
-       "      <td>chart</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>import</td>\n",
-       "      <td>1</td>\n",
-       "      <td>Is Switzerland, the leading importing country ...</td>\n",
-       "      <td>Switzerland is the leading importing country o...</td>\n",
-       "      <td>1</td>\n",
-       "      <td>None</td>\n",
-       "      <td>0_0_import</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>VS</td>\n",
-       "      <td>chart</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>import</td>\n",
-       "      <td>2</td>\n",
-       "      <td>Is France, the leading importing country of go...</td>\n",
-       "      <td>Switzerland is the leading importing country o...</td>\n",
-       "      <td>0</td>\n",
-       "      <td>None</td>\n",
-       "      <td>0_0_import</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>VS</td>\n",
-       "      <td>chart</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0</td>\n",
-       "      <td>import</td>\n",
-       "      <td>3</td>\n",
-       "      <td>Is the United States, the leading importing co...</td>\n",
-       "      <td>Switzerland is the leading importing country o...</td>\n",
-       "      <td>0</td>\n",
-       "      <td>None</td>\n",
-       "      <td>0_0_import</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "  category subcategory  visual_input  set_id  figure_id sample_note   \n",
-       "0       VS       chart             0       0          0      import  \\\n",
-       "1       VS       chart             0       0          0      import   \n",
-       "2       VS       chart             0       0          0      import   \n",
-       "3       VS       chart             0       0          0      import   \n",
-       "\n",
-       "   question_id                                           question   \n",
-       "0            0  Is China, Hongkong SAR, the leading importing ...  \\\n",
-       "1            1  Is Switzerland, the leading importing country ...   \n",
-       "2            2  Is France, the leading importing country of go...   \n",
-       "3            3  Is the United States, the leading importing co...   \n",
-       "\n",
-       "                                   gt_answer_details  gt_answer filename   \n",
-       "0  Switzerland is the leading importing country o...          0     None  \\\n",
-       "1  Switzerland is the leading importing country o...          1     None   \n",
-       "2  Switzerland is the leading importing country o...          0     None   \n",
-       "3  Switzerland is the leading importing country o...          0     None   \n",
-       "\n",
-       "  unique_figure_columns  \n",
-       "0            0_0_import  \n",
-       "1            0_0_import  \n",
-       "2            0_0_import  \n",
-       "3            0_0_import  "
-      ]
-     },
-     "execution_count": 25,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "df[df['unique_figure_columns'] == '0_0_import']"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 38,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 33,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "139"
-      ]
-     },
-     "execution_count": 33,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "df.sample_note.nunique()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 40,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "['pi',\n",
-       " 'DC_metro',\n",
-       " 'phone_sales',\n",
-       " 'population growth',\n",
-       " 'simpson',\n",
-       " 'math_prob',\n",
-       " 'population',\n",
-       " 'china_export_us',\n",
-       " 'teen_population',\n",
-       " 'para_angle',\n",
-       " 'teen_population',\n",
-       " 'math_prob',\n",
-       " 'world_war2',\n",
-       " 'sqrt2',\n",
-       " 'duck',\n",
-       " 'teen_population',\n",
-       " 'line',\n",
-       " 'illusion',\n",
-       " 'Red Velvet',\n",
-       " 'math_prob',\n",
-       " 'usmap',\n",
-       " 'central_bank',\n",
-       " 'flow',\n",
-       " 'line',\n",
-       " 'circle',\n",
-       " 'square',\n",
-       " 'Kennedy',\n",
-       " 'Berlin',\n",
-       " 'NBA',\n",
-       " 'parking']"
-      ]
-     },
-     "execution_count": 40,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "random_sample.to_list()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 39,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>category</th>\n",
-       "      <th>subcategory</th>\n",
-       "      <th>visual_input</th>\n",
-       "      <th>set_id</th>\n",
-       "      <th>figure_id</th>\n",
-       "      <th>sample_note</th>\n",
-       "      <th>question_id</th>\n",
-       "      <th>question</th>\n",
-       "      <th>gt_answer_details</th>\n",
-       "      <th>gt_answer</th>\n",
-       "      <th>filename</th>\n",
-       "      <th>unique_figure_columns</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>24</th>\n",
-       "      <td>VS</td>\n",
-       "      <td>chart</td>\n",
-       "      <td>0</td>\n",
-       "      <td>2</td>\n",
-       "      <td>0</td>\n",
-       "      <td>teen_population</td>\n",
-       "      <td>0</td>\n",
-       "      <td>From 2000-2050, is the population aged between...</td>\n",
-       "      <td>the population aged between 15 and 29 years in...</td>\n",
-       "      <td>0</td>\n",
-       "      <td>None</td>\n",
-       "      <td>2_0_teen_population</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>25</th>\n",
-       "      <td>VS</td>\n",
-       "      <td>chart</td>\n",
-       "      <td>0</td>\n",
-       "      <td>2</td>\n",
-       "      <td>0</td>\n",
-       "      <td>teen_population</td>\n",
-       "      <td>1</td>\n",
-       "      <td>From 2000-2050, is the population aged between...</td>\n",
-       "      <td>the population aged between 15 and 29 years in...</td>\n",
-       "      <td>1</td>\n",
-       "      <td>None</td>\n",
-       "      <td>2_0_teen_population</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>26</th>\n",
-       "      <td>VS</td>\n",
-       "      <td>chart</td>\n",
-       "      <td>0</td>\n",
-       "      <td>2</td>\n",
-       "      <td>0</td>\n",
-       "      <td>teen_population</td>\n",
-       "      <td>2</td>\n",
-       "      <td>From 2000-2050, is the population aged between...</td>\n",
-       "      <td>the population aged between 15 and 29 years in...</td>\n",
-       "      <td>0</td>\n",
-       "      <td>None</td>\n",
-       "      <td>2_0_teen_population</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>27</th>\n",
-       "      <td>VS</td>\n",
-       "      <td>chart</td>\n",
-       "      <td>0</td>\n",
-       "      <td>2</td>\n",
-       "      <td>0</td>\n",
-       "      <td>teen_population</td>\n",
-       "      <td>3</td>\n",
-       "      <td>From 2000-2050, is the population aged between...</td>\n",
-       "      <td>the population aged between 15 and 29 years in...</td>\n",
-       "      <td>0</td>\n",
-       "      <td>None</td>\n",
-       "      <td>2_0_teen_population</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>28</th>\n",
-       "      <td>VS</td>\n",
-       "      <td>chart</td>\n",
-       "      <td>0</td>\n",
-       "      <td>2</td>\n",
-       "      <td>0</td>\n",
-       "      <td>teen_population</td>\n",
-       "      <td>4</td>\n",
-       "      <td>From 2000-2050, is the population aged between...</td>\n",
-       "      <td>the population aged between 15 and 29 years in...</td>\n",
-       "      <td>0</td>\n",
-       "      <td>None</td>\n",
-       "      <td>2_0_teen_population</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>...</th>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1069</th>\n",
-       "      <td>VD</td>\n",
-       "      <td>video</td>\n",
-       "      <td>2</td>\n",
-       "      <td>12</td>\n",
-       "      <td>1</td>\n",
-       "      <td>circle</td>\n",
-       "      <td>1</td>\n",
-       "      <td>According to the positive sequence of the imag...</td>\n",
-       "      <td>The circle keeping in the same place.</td>\n",
-       "      <td>1</td>\n",
-       "      <td>./VD/video/12_1.png</td>\n",
-       "      <td>12_1_circle</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1070</th>\n",
-       "      <td>VD</td>\n",
-       "      <td>video</td>\n",
-       "      <td>2</td>\n",
-       "      <td>12</td>\n",
-       "      <td>1</td>\n",
-       "      <td>circle</td>\n",
-       "      <td>2</td>\n",
-       "      <td>According to the positive sequence of the imag...</td>\n",
-       "      <td>The circle keeping in the same place.</td>\n",
-       "      <td>0</td>\n",
-       "      <td>./VD/video/12_1.png</td>\n",
-       "      <td>12_1_circle</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1071</th>\n",
-       "      <td>VD</td>\n",
-       "      <td>video</td>\n",
-       "      <td>2</td>\n",
-       "      <td>12</td>\n",
-       "      <td>2</td>\n",
-       "      <td>circle</td>\n",
-       "      <td>0</td>\n",
-       "      <td>According to the positive sequence of the imag...</td>\n",
-       "      <td>The circle keeping in the same place.</td>\n",
-       "      <td>0</td>\n",
-       "      <td>./VD/video/12_2.png</td>\n",
-       "      <td>12_2_circle</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1072</th>\n",
-       "      <td>VD</td>\n",
-       "      <td>video</td>\n",
-       "      <td>2</td>\n",
-       "      <td>12</td>\n",
-       "      <td>2</td>\n",
-       "      <td>circle</td>\n",
-       "      <td>1</td>\n",
-       "      <td>According to the positive sequence of the imag...</td>\n",
-       "      <td>The circle keeping in the same place.</td>\n",
-       "      <td>1</td>\n",
-       "      <td>./VD/video/12_2.png</td>\n",
-       "      <td>12_2_circle</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1073</th>\n",
-       "      <td>VD</td>\n",
-       "      <td>video</td>\n",
-       "      <td>2</td>\n",
-       "      <td>12</td>\n",
-       "      <td>2</td>\n",
-       "      <td>circle</td>\n",
-       "      <td>2</td>\n",
-       "      <td>According to the positive sequence of the imag...</td>\n",
-       "      <td>The circle keeping in the same place.</td>\n",
-       "      <td>0</td>\n",
-       "      <td>./VD/video/12_2.png</td>\n",
-       "      <td>12_2_circle</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "<p>352 rows × 12 columns</p>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "     category subcategory  visual_input  set_id  figure_id      sample_note   \n",
-       "24         VS       chart             0       2          0  teen_population  \\\n",
-       "25         VS       chart             0       2          0  teen_population   \n",
-       "26         VS       chart             0       2          0  teen_population   \n",
-       "27         VS       chart             0       2          0  teen_population   \n",
-       "28         VS       chart             0       2          0  teen_population   \n",
-       "...       ...         ...           ...     ...        ...              ...   \n",
-       "1069       VD       video             2      12          1           circle   \n",
-       "1070       VD       video             2      12          1           circle   \n",
-       "1071       VD       video             2      12          2           circle   \n",
-       "1072       VD       video             2      12          2           circle   \n",
-       "1073       VD       video             2      12          2           circle   \n",
-       "\n",
-       "      question_id                                           question   \n",
-       "24              0  From 2000-2050, is the population aged between...  \\\n",
-       "25              1  From 2000-2050, is the population aged between...   \n",
-       "26              2  From 2000-2050, is the population aged between...   \n",
-       "27              3  From 2000-2050, is the population aged between...   \n",
-       "28              4  From 2000-2050, is the population aged between...   \n",
-       "...           ...                                                ...   \n",
-       "1069            1  According to the positive sequence of the imag...   \n",
-       "1070            2  According to the positive sequence of the imag...   \n",
-       "1071            0  According to the positive sequence of the imag...   \n",
-       "1072            1  According to the positive sequence of the imag...   \n",
-       "1073            2  According to the positive sequence of the imag...   \n",
-       "\n",
-       "                                      gt_answer_details  gt_answer   \n",
-       "24    the population aged between 15 and 29 years in...          0  \\\n",
-       "25    the population aged between 15 and 29 years in...          1   \n",
-       "26    the population aged between 15 and 29 years in...          0   \n",
-       "27    the population aged between 15 and 29 years in...          0   \n",
-       "28    the population aged between 15 and 29 years in...          0   \n",
-       "...                                                 ...        ...   \n",
-       "1069              The circle keeping in the same place.          1   \n",
-       "1070              The circle keeping in the same place.          0   \n",
-       "1071              The circle keeping in the same place.          0   \n",
-       "1072              The circle keeping in the same place.          1   \n",
-       "1073              The circle keeping in the same place.          0   \n",
-       "\n",
-       "                 filename unique_figure_columns  \n",
-       "24                   None   2_0_teen_population  \n",
-       "25                   None   2_0_teen_population  \n",
-       "26                   None   2_0_teen_population  \n",
-       "27                   None   2_0_teen_population  \n",
-       "28                   None   2_0_teen_population  \n",
-       "...                   ...                   ...  \n",
-       "1069  ./VD/video/12_1.png           12_1_circle  \n",
-       "1070  ./VD/video/12_1.png           12_1_circle  \n",
-       "1071  ./VD/video/12_2.png           12_2_circle  \n",
-       "1072  ./VD/video/12_2.png           12_2_circle  \n",
-       "1073  ./VD/video/12_2.png           12_2_circle  \n",
-       "\n",
-       "[352 rows x 12 columns]"
-      ]
-     },
-     "execution_count": 39,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "llm_exp",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.9.16"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}
@@ -1,91 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "cf6cd45914614a8a87e1713af52cafaa",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Downloading readme:   0%|          | 0.00/5.47k [00:00<?, ?B/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "152e6fff658a4641bf0b017d9bf41e2d",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Resolving data files:   0%|          | 0/27 [00:00<?, ?it/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "b5b65c809aee4e008de61efe6c1a6d3c",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Downloading data:   0%|          | 0/27 [00:00<?, ?files/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    }
-   ],
-   "source": [
-    "from datasets import load_dataset\n",
-    "import os \n",
-    "\n",
-    "os.environ['TRANSFORMERS_CACHE'] = '/data4/saaket/cache'\n",
-    "os.environ['HF_HOME'] = '/data4/saaket/cache'\n",
-    "os.environ['HF_DATASETS_CACHE'] = '/data4/saaket/cache'\n",
-    "\n",
-    "dataset = load_dataset(\"osunlp/Multimodal-Mind2Web\")\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "llm_exp",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.9.16"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}
@@ -1,165 +0,0 @@
-import os 
-import json 
-import pandas as pd 
-from collections import Counter 
-
-random_sels = ['pi',
- 'DC_metro',
- 'phone_sales',
- 'population growth',
- 'simpson',
- 'math_prob',
- 'population',
- 'china_export_us',
- 'teen_population',
- 'para_angle',
- 'teen_population',
- 'math_prob',
- 'world_war2',
- 'sqrt2',
- 'duck',
- 'teen_population',
- 'line',
- 'illusion',
- 'Red Velvet',
- 'math_prob',
- 'usmap',
- 'central_bank',
- 'flow',
- 'line',
- 'circle',
- 'square',
- 'Kennedy',
- 'Berlin',
- 'NBA',
- 'parking']
-
-def open_log_file(file):
-    with open(file, 'r') as f:
-        data = json.load(f)
-    return data
-
-def collect_date_split_answers():    
-    debate_baseline_31st_file = '/home/saaket/lmm-agents/logs/2024_31_5/baseline_answers_20240530_200504.json'
-    debate_31st_file = '/home/saaket/lmm-agents/logs/2024_31_5/answers_debate_20240530_200504.json'
-    
-    actor_critic_30th_file = '/home/saaket/lmm-agents/logs/2024_30_5/answers_debate_20240530_200754.json'
-
-    actor_critic = open_log_file(actor_critic_30th_file)
-
-    debate_baseline_31st = open_log_file(debate_baseline_31st_file)
-    debate_31st = open_log_file(debate_31st_file)
-    debate = debate_31st
-    debate_baseline = debate_baseline_31st
-    assert len(debate) == len(debate_baseline) == len(actor_critic)
-    return debate, debate_baseline, actor_critic
-
-
-
-
-def full_debate_compile(answers):
-    
-    # df = pd.read_json('/home/saaket/lmm-agents/lmm_debate/got_these_wrong_before_vd.json')
-    # df = df.reset_index() 
-
-    # df = df.iloc[:240]
-    hallusion_path = '/data4/saaket/hallusion_bench/'
-    data_file = os.path.join(hallusion_path, 'HallusionBench.json')
-    with open(data_file, 'r') as f:
-        data = json.load(f)
-    df = pd.DataFrame(data)
-    
-    df = df[df['sample_note'].isin(random_sels)]
-
-    baseline_answers = open_log_file(answers['baseline'])
-    debate_answers = open_log_file(answers['debate'])
-
-    baseline_answers_df = df.copy()
-    debate_answers_df = df.copy()
-
-    baseline_answers_df['model_prediction'] = baseline_answers
-    debate_answers_df['model_prediction'] = debate_answers
-
-    # make dirs
-    os.makedirs('output/full_debate_corrected_mod_prompt/debate_baseline', exist_ok=True)
-    os.makedirs('output/full_debate_corrected_mod_prompt/debate', exist_ok=True)
-
-    debate_answers_df.to_json('output/full_debate_corrected_mod_prompt/debate/HallusionBench_result.json', orient='records')
-    baseline_answers_df.to_json('output/full_debate_corrected_mod_prompt/debate_baseline/HallusionBench_result.json', orient='records')
-
-
-def compile_in_hallusion_format_old():
-    debate, debate_baseline, actor_critic = collect_date_split_answers()
-    
-    # Load the hallusionbench data
-    hallusion_path = '/data4/saaket/hallusion_bench/'
-    data_file = os.path.join(hallusion_path, 'HallusionBench.json')
-    with open(data_file, 'r') as f:
-        data = json.load(f)
-    df = pd.DataFrame(data)
-    print(df)
-    #### DATA FILTERING #####
-
-    # What is the size of the df and the distribution of categories and subcategories
-    print(len(df))
-    print(Counter(df['category']))
-    print(Counter(df['subcategory']))
-
-
-    # Let's only keep the visual dependent examples (VD)
-    df_vd = df.loc[df['category'] == 'VD'].reset_index(drop=True)
-    print(df_vd)
-    # Create three new dataframed debate_answers, debate_baseline_answers, and actor_critic_answers
-    # Each datafram is df_vd with a new column with the key model_prediction which will correspond to the corresponding answers
-    # from the debate, debate_baseline, and actor_critic lists
-    debate_answers = df_vd.copy()
-    debate_baseline_answers = df_vd.copy()
-    actor_critic_answers = df_vd.copy()
-    
-    debate_answers['model_prediction'] = debate
-    debate_baseline_answers['model_prediction'] = debate_baseline
-    actor_critic_answers['model_prediction'] = actor_critic
-
-    # Save the three dataframes as json files named HallusionBench_result.json in three different folders
-    # Name the folders debate, debate_baseline, and actor_critic in the current directory, make folders if they dont exist
-    os.makedirs('debate', exist_ok=True)
-    os.makedirs('debate_baseline', exist_ok=True)
-    os.makedirs('actor_critic', exist_ok=True)
-    
-    debate_answers.to_json('debate/HallusionBench_result.json', orient='records')
-    debate_baseline_answers.to_json('debate_baseline/HallusionBench_result.json', orient='records')
-    actor_critic_answers.to_json('actor_critic/HallusionBench_result.json', orient='records')
-
-    print("Saved the three dataframes as json files named HallusionBench_result.json in three different folders")
-
-
-def collect_baseline_answers():
-    logs = os.listdir('/home/saaket/lmm-agents/logs/2024_30_5')
-
-    for log in logs:
-        if log.startswith("answers"):
-            logs.remove(log)
-        elif log.startswith("baseline"):
-            logs.remove(log)
-
-
-    baseline_answers = []
-    for i, log in enumerate(logs):
-        with open(f'/home/saaket/lmm-agents/logs/2024_30_5/{log}', 'r') as f:
-            data = json.load(f)
-        print(data)
-        baseline_answers.append(data[1].replace("My answer: ", ''))
-
-    with open(f'/home/saaket/lmm-agents/logs/2024_30_5/answers_baseline.json', 'w') as f:
-        json.dump(baseline_answers, f)
-
-        
-if __name__ == '__main__':
-    # collect_baseline_answers()
-    # print(collect_date_split_answers()[0][:5])
-    # print("Collected baseline answers.
-    # compile_in_hallusion_format()
-    full_debate_compile(
-        {'baseline': '/home/saaket/lmm-agents/logs/2024_10_6/baseline_answers_20240610_135648.json',
-         'debate': '/home/saaket/lmm-agents/logs/2024_10_6/answers_debate_20240610_135648.json'}
-    )
@@ -1,2 +0,0 @@
-LOG_PATH = '/home/saaket/lmm-agents/logs'
-DATA_PATH = '/home/saaket/lmm-agents/data'
@@ -1,5 +0,0 @@
-DEBATER_SYSTEM_PROMPT = '''You are a debate agent. You will be asked a question and in some cases provided an image. You will try to answer the question to the best of your abilities. You will have a competitor. You do not always have to agree with your competitor. Note: no more information can be provided, the debate has to be settled based on the available evidence. '''
-MODERATOR_SYSTEM_PROMPT = '''You are a moderator. There will be two debaters involved in a competition to answer a question which might have an image as the context in some cases. At the end of each round, you will evaluate the candidates' answers. If you think one of the debater is correct, you should end your response with Final Answer: <The Final Correct Answer based on the debate>. If not, do not say Final Answer, just say - Debate goes on. Note: no more information can be provided, the debate has to be settled based on the available evidence. '''
-MODERATOR_FINAL_PROMPT = '''You are a moderator. There will be two debaters involved in a competition to answer a question which might have an image as the context in some cases. You will be provided the full debate with arguments of two debaters. You will evaluate the candidates' arguments and select the final answer. Format you response as: Analysis:<Step by step Analysis of the debate>. Final Answer: <The Final Correct Answer to the question based on the debate.>. Note: no more information can be provided, the debate has to be settled based on the available evidence. ''' 
-ACTOR_PROMPT = '''You will be provded an image and asked a question about it. You will try to answer the question to the best of your abilities. You will receive feedback from a critic if your answer is incorrect. You will have to provide a new answer based on the feedback.'''
-CRITIC_PROMPT = '''You are a critic in an image question answer challenge. You will be provided with a question about an image and my answer to the question. Critique my reasoning and tell me if my answer is correct. If you think my answer is correct end your response with Verification: Success. Else just give me the critique and say Verification: Failure.'''
@@ -16,7 +16,7 @@ setup(
        'openai',
        'transformers',
        'fastapi',
-        'openaci @ git+https://github.com/simular-ai/OpenACI.git@main#egg=openaci',
+        'openaci @ git+https://github.com/simular-ai/OpenACI',
    ],
    classifiers=[
        'Programming Language :: Python :: 3',
				`@@ -0,0 +1 @@`
				{"disable autosave please": "{\"nodes\":[{\"name\":\"Open Settings\",\"info\":\"Click on the Code menu in the top menu bar. Select Preferences, then Settings. Alternatively, use the shortcut Cmd + ,.\"},{\"name\":\"Search for Auto Save\",\"info\":\"In the Settings editor, type \\\"autosave\\\" into the search bar.\"},{\"name\":\"Disable Auto Save\",\"info\":\"Find the setting labeled Files: Auto Save. Click on the dropdown menu next to it and select off.\"}],\"edges\":[[{\"name\":\"Open Settings\",\"info\":\"Click on the Code menu in the top menu bar. Select Preferences, then Settings. Alternatively, use the shortcut Cmd + ,.\"},{\"name\":\"Search for Auto Save\",\"info\":\"In the Settings editor, type \\\"autosave\\\" into the search bar.\"}],[{\"name\":\"Search for Auto Save\",\"info\":\"In the Settings editor, type \\\"autosave\\\" into the search bar.\"},{\"name\":\"Disable Auto Save\",\"info\":\"Find the setting labeled Files: Auto Save. Click on the dropdown menu next to it and select off.\"}]]}"}
				`@@ -0,0 +1 @@`
				{"disable autosave please": "To disable autosave in Visual Studio Code on macOS, follow these steps:\n\n1. Open Settings:\n - Click on the Code menu in the top menu bar.\n - Select Preferences, then Settings. Alternatively, use the shortcut Cmd + ,.\n\n2. Search for Auto Save:\n - In the Settings editor, type \"autosave\" into the search bar.\n\n3. Disable Auto Save:\n - Find the setting labeled Files: Auto Save.\n - Click on the dropdown menu next to it and select off.\n\nThese steps will disable the auto-save feature, requiring manual saving of files."}