add api setup instructions, remove stray and deprecated Engine and Agent codes

2024-10-17 11:02:31 -07:00
commit 62d012447f
@@ -160,4 +160,5 @@ cython_debug/
 #  and can be added to the global gitignore or merged into this file.  For a more nuclear
 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
-logs/
+logs/
 .DS_Store
@@ -48,6 +48,26 @@ Install the agent_s package and dependencies
 pip install -e .
 ```
 Set your LLM API keys and other environment variables. You can do this by adding the following lines to your .bashrc (Linux) or .zshrc (macOS) file. We support OpenAI, Azure OpenAI, Anthropic, and vLLM models.
 1. OpenAI
 ```
 export OPENAI_API_KEY=<YOUR_API_KEY>
 ```
 2. Anthropic
 ```
 export ANTHROPIC_API_KEY=<YOUR_API_KEY>
 ```
 3. OpenAI on Azure
 ```
 export AZURE_OPENAI_API_BASE=<DEPLOYMENT_NAME>
 export AZURE_OPENAI_API_KEY=<YOUR_API_KEY>
 ```
 4. vLLM for Local Models
 ```
 export vLLM_ENDPOINT_URL=<YOUR_DEPLOYMENT_URL>
 ```
 ### Setup Retrieval from Web using Perplexica
 1. Ensure Docker is installed and running on your system.
@@ -5,24 +5,12 @@
 from agent_s.MultimodalEngine import (
    LMMEngineOpenAI,
    LMMEngineAzureOpenAI,
    LMMEngineLlava,
    LMMEngineCogVLM,
    LMMEnginevLLM,
    LMMEngineAnthropic,
    LMMEngineQwen,
 )
 import base64
 import re
 # TODO: Import only if module exists, else ignore
 # from llava.constants import (
 #     IMAGE_TOKEN_INDEX,
 #     DEFAULT_IMAGE_TOKEN,
 #     DEFAULT_IM_START_TOKEN,
 #     DEFAULT_IM_END_TOKEN,
 #     IMAGE_PLACEHOLDER,
 # )
 data_type_map = {
    "openai": {"image_url": "image_url"},
    "anthropic": {"image_url": "image"},
@@ -42,12 +30,6 @@ class LMMAgent:
                    self.engine = LMMEngineAzureOpenAI(**engine_params)
                elif engine_type == "vllm":
                    self.engine = LMMEnginevLLM(**engine_params)
                elif engine_type == "qwen":
                    self.engine = LMMEngineQwen(**engine_params)
                elif engine_type == "llava":
                    self.engine = LMMEngineLlava(**engine_params)
                elif engine_type == "cogvlm":
                    self.engine = LMMEngineCogVLM(**engine_params)
                else:
                    raise ValueError("engine_type must be either 'openai' or 'azure'")
            else:
@@ -73,15 +55,13 @@ class LMMAgent:
    def reset(
        self,
    ):
-        if isinstance(self.engine, (LMMEngineCogVLM, LMMEngineLlava)):
+
-            self.messages = []
+        self.messages = [
-        else:
+            {
-            self.messages = [
+                "role": "system",
-                {
+                "content": [{"type": "text", "text": self.system_prompt}],
-                    "role": "system",
+            }
-                    "content": [{"type": "text", "text": self.system_prompt}],
+        ]
                }
            ]
    def add_system_prompt(self, system_prompt):
        self.system_prompt = system_prompt
@@ -98,12 +78,6 @@ class LMMAgent:
                }
            )
        # Don't add the system prompt if we are using llava or other hf models
        if isinstance(self.engine, LMMEngineLlava) or isinstance(
            self.engine, LMMEngineCogVLM
        ):
            self.messages = []
    def remove_message_at(self, index):
        """Remove a message at a given index"""
        if index < len(self.messages):
@@ -135,80 +109,8 @@ class LMMAgent:
    ):
        """Add a new message to the list of messages"""
-        # For inference from locally hosted llava based on https://github.com/haotian-liu/LLaVA/
+        # API-style inference from OpenAI and AzureOpenAI
-        if isinstance(self.engine, LMMEngineLlava):
+        if isinstance(self.engine, (LMMEngineOpenAI, LMMEngineAzureOpenAI)):
            # No system prompt so first message will be from user
            if len(self.messages) == 0:
                role = "user"
            else:
                # infer role from previous message
                if self.messages[-1]["role"] == "user":
                    role = "assistant"
                elif self.messages[-1]["role"] == "assistant":
                    role = "user"
            image_token_se = (
                DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN
            )
            qs = text_content
            if role == "user":
                if len(self.messages) == 0:
                    # If this is the very first user message, add the system prompt to it to dictate behavior
                    qs = self.system_prompt + "\n" + qs
                    # TODO: Add comment explaining what this next part does
                    if IMAGE_PLACEHOLDER in qs:
                        if self.engine.model.config.mm_use_im_start_end:
                            qs = re.sub(IMAGE_PLACEHOLDER, image_token_se, qs)
                        else:
                            qs = re.sub(IMAGE_PLACEHOLDER, DEFAULT_IMAGE_TOKEN, qs)
                    else:
                        if self.engine.model.config.mm_use_im_start_end:
                            qs = image_token_se + "\n" + qs
                        else:
                            qs = DEFAULT_IMAGE_TOKEN + "\n" + qs
                message = {"role": role, "content": qs}
            else:
                message = {"role": role, "content": text_content}
            # Capable of handling only one image right now. TODO: make capable of handling more images
            if image_content:
                if self.engine.args.image_file == None:
                    self.engine.args.image_file = image_content
            self.messages.append(message)
        elif isinstance(self.engine, LMMEngineCogVLM):
            # No system prompt so first message will be from user
            if len(self.messages) == 0:
                role = "user"
            else:
                # infer role from previous message
                if self.messages[-1]["role"] == "user":
                    role = "assistant"
                elif self.messages[-1]["role"] == "assistant":
                    role = "user"
            # Add message content as a new message, if this is the first message prepend with system prompt
            if len(self.messages) == 0:
                self.messages.append(
                    {
                        "role": role,
                        "content": {
                            "type": "text",
                            "text": self.system_prompt + "\n\n" + text_content,
                        },
                    }
                )
            else:
                self.messages.append(
                    {"role": role, "content": {"type": "text", "text": text_content}}
                )
        # For API-style inference from OpenAI and AzureOpenAI
        elif isinstance(self.engine, (LMMEngineOpenAI, LMMEngineAzureOpenAI)):
            # infer role from previous message
            if role != "user":
                if self.messages[-1]["role"] == "system":
@@ -299,8 +201,8 @@ class LMMAgent:
                        )
            self.messages.append(message)
-        # Custom Qwen Model inference
+        # Locally hosted vLLM model inference 
-        elif isinstance(self.engine, LMMEngineQwen):
+        elif isinstance(self.engine, LMMEnginevLLM):
           # infer role from previous message
            if role != "user":
                if self.messages[-1]["role"] == "system":
@@ -338,50 +240,6 @@ class LMMAgent:
                    )
            self.messages.append(message)
        # Custom Llama3.2 Model inference
        elif isinstance(self.engine, LMMEngineTogether):
            # infer role from previous message
            if role != "user":
                if self.messages[-1]["role"] == "system":
                    role = "user"
                elif self.messages[-1]["role"] == "user":
                    role = "assistant"
                elif self.messages[-1]["role"] == "assistant":
                    role = "user"
            message = {
                "role": role,
                "content": [{"type": "text", "text": text_content}],
            }
            if image_content:
                # Check if image_content is a list or a single image
                if isinstance(image_content, list):
                    # If image_content is a list of images, loop through each image
                    for image in image_content:
                        base64_image = self.encode_image(image)
                        message["content"].append(
                            {
                                "type": "image_url",
                                "image_url": {
                                    "url": f"data:image/png;base64,{base64_image}",
                                },
                            }
                        )
                else:
                    # If image_content is a single image, handle it directly
                    base64_image = self.encode_image(image_content)
                    message["content"].append(
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/png;base64,{base64_image}",
                            },
                        }
                    )
            self.messages.append(message)
    def get_response(
        self,
        user_message=None,
@@ -122,33 +122,6 @@ class LMMEngineAnthropic(LMMEngine):
            **kwargs,
        ).content[0].text
 class LMMEngineQwen(LMMEngine):
    """Engine that forwards chat messages to a custom Qwen HTTP endpoint."""

    def __init__(self, base_url=None, api_key=None, model=None, rate_limit=-1, **kwargs):
        """Store the connection settings for the Qwen endpoint.

        Args:
            base_url: URL that ``generate`` POSTs to. Falls back to the
                ``QWEN_ENDPOINT_URL`` environment variable when not given.
            api_key: Stored on the instance; ``generate`` does not send it.
            model: Stored on the instance; ``generate`` does not send it.
            rate_limit: Accepted but not used by this engine.
            **kwargs: Accepted and ignored.

        Raises:
            ValueError: If neither ``base_url`` nor ``QWEN_ENDPOINT_URL``
                provides an endpoint URL.
        """
        self.model = model
        self.api_key = api_key
        self.base_url = base_url or os.getenv("QWEN_ENDPOINT_URL")
        if self.base_url is None:
            # The message names the parameter and the environment variable
            # that this constructor actually reads.
            raise ValueError(
                "An endpoint URL needs to be provided in either the base_url parameter or as an environment variable named QWEN_ENDPOINT_URL"
            )

    def generate(self, messages, temperature=0., max_new_tokens=None, **kwargs):
        """Generate the next message based on previous messages.

        Args:
            messages: Conversation so far; sent as the ``messages`` field of
                the JSON request body.
            temperature: Accepted but not forwarded to the endpoint.
            max_new_tokens: Accepted but not forwarded to the endpoint.
            **kwargs: Accepted and ignored.

        Returns:
            The first element of the ``response`` field of the JSON reply when
            the endpoint answers with HTTP 200; otherwise ``None`` after
            printing the status code and the response text.
        """
        data = {
            "messages": messages,
        }
        response = requests.post(self.base_url, json=data)

        if response.status_code == 200:
            return response.json()["response"][0]

        # Best-effort failure path: report the problem and hand back None.
        print(f"Qwen LLM generation failed with status code: {response.status_code}")
        print("Error message:", response.text)
        return None
 class OpenAIEmbeddingEngine(LMMEngine):
    def __init__(
        self,
@@ -255,163 +228,3 @@ class LMMEnginevLLM(LMMEngine):
            extra_body={"repetition_penalty": repetition_penalty},
        )
        return completion.choices[0].message.content
 class LMMEngineLlava(LMMEngine):
    """Engine that runs a locally loaded LLaVA model."""

    def __init__(self, model_path=None, model=None, tokenizer=None, image_processor=None, context_len=None, max_new_tokens=None, rate_limit=-1, **kwargs):
        """Load (or adopt) a LLaVA model and pick its conversation template.

        Args:
            model_path: Path of the model; required.
            model: Already-loaded model. When falsy, the tokenizer, model,
                image processor and context length are loaded from
                ``model_path`` via ``load_pretrained_model``.
            tokenizer: Tokenizer to use together with a supplied ``model``.
            image_processor: Image processor to use with a supplied ``model``.
            context_len: Context length to record with a supplied ``model``.
            max_new_tokens: Generation cap; 2048 when not given.
            rate_limit: Requests per minute, or -1 for no limit.
            **kwargs: Accepted and ignored.

        Raises:
            AssertionError: If ``model_path`` is ``None``.
        """
        assert model_path is not None, "model path must be provided"
        self.model_path = model_path
        self.request_interval = 0 if rate_limit == -1 else 60.0 / rate_limit

        # Lightweight attribute bag that is later handed to ``image_parser``.
        self.args = type('Args', (), {
            "model_path": model_path,
            "model_base": None,
            "model_name": get_model_name_from_path(model_path),
            "query": None,
            "conv_mode": None,
            "image_file": None,
            "sep": ",",
            "temperature": 0.,
            "top_p": 1,
            "num_beams": 1,
            "max_new_tokens": max_new_tokens if max_new_tokens else 2048
        })()

        if not model:
            self.tokenizer, self.model, self.image_processor, self.context_len = load_pretrained_model(
                model_path, None, self.args.model_name
            )
        else:
            self.tokenizer = tokenizer
            self.model = model
            self.image_processor = image_processor
            self.context_len = context_len

        # Check model base type for conversation template
        model_name = self.args.model_name.lower()
        if "llama-2" in model_name:
            self.args.conv_mode = "llava_llama_2"
        elif "v1" in model_name:
            self.args.conv_mode = "llava_v1"
        elif "mpt" in model_name:
            # Must be stored on ``self.args``: the template lookup below (and
            # the one in ``generate``) reads ``self.args.conv_mode``.
            self.args.conv_mode = "mpt"
        else:
            self.args.conv_mode = "llava_v0"

        self.conversation = conv_templates[self.args.conv_mode].copy()

    def generate(self, messages, image=None, temperature=0., max_new_tokens=None, **kwargs):
        """Generate the next message based on previous messages.

        Args:
            messages: Conversation so far; each item's ``content`` is appended
                to a fresh conversation template, with roles alternating by
                position in the list.
            image: Accepted but unused; the image is taken from
                ``self.args.image_file``.
            temperature: Accepted but unused; ``self.args.temperature`` is
                what reaches the model.
            max_new_tokens: Accepted but unused; ``self.args.max_new_tokens``
                is what reaches the model.
            **kwargs: Accepted and ignored.

        Returns:
            The first decoded output sequence, stripped of surrounding
            whitespace.
        """
        # Refresh the conversation holder everytime
        self.conversation = conv_templates[self.args.conv_mode].copy()

        for idx, message in enumerate(messages):
            self.conversation.append_message(self.conversation.roles[idx % 2], message['content'])

        # Add the "ASSISTANT:" starter before generation
        self.conversation.append_message(self.conversation.roles[1], None)
        prompt = self.conversation.get_prompt()

        self.args.image_files = [self.args.image_file]
        image_files = image_parser(self.args)
        images = load_images(image_files)
        image_sizes = [x.size for x in images]
        images_tensor = process_images(
            images,
            self.image_processor,
            self.model.config
        ).to(self.model.device, dtype=torch.float16)

        input_ids = (
            tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt")
            .unsqueeze(0)
            .cuda()
        )

        with torch.inference_mode():
            output_ids = self.model.generate(
                input_ids,
                images=images_tensor,
                image_sizes=image_sizes,
                # Sample only when a positive temperature is configured.
                do_sample=self.args.temperature > 0,
                temperature=self.args.temperature,
                top_p=self.args.top_p,
                num_beams=self.args.num_beams,
                max_new_tokens=self.args.max_new_tokens,
            )

        outputs = self.tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
        return outputs
 class LMMEngineCogVLM(LMMEngine):
    def __init__(self, model_path=None, model=None, tokenizer=None, image_processor=None, context_len=None, max_new_tokens=None, device=None, rate_limit=-1, **kwargs):
        """Load (or adopt) a CogVLM model and set up generation defaults.

        Args:
            model_path: Path of the model; required.
            model: Already-loaded model. When falsy, the tokenizer and model
                are loaded from ``model_path`` with ``trust_remote_code=True``.
            tokenizer: Tokenizer to use together with a supplied ``model``.
            image_processor: Accepted but unused.
            context_len: Accepted but unused.
            max_new_tokens: Generation cap; 2048 when not given.
            device: Device to run on. When falsy, ``'cuda'`` is used if
                available, else ``'cpu'``.
            rate_limit: Requests per minute, or -1 for no limit.
            **kwargs: Accepted and ignored.

        Raises:
            AssertionError: If ``model_path`` is ``None``.
        """
        assert model_path is not None, "model path must be provided"
        self.model_path = model_path
        self.request_interval = 0 if rate_limit == -1 else 60.0 / rate_limit

        if device:
            self.device = device
        else:
            self.device = 'cuda' if torch.cuda.is_available() else 'cpu'

        # bfloat16 only when CUDA is present and the device reports a compute
        # capability major version of at least 8; float16 otherwise.
        self.torch_type = torch.bfloat16 if torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8 else torch.float16

        self.gen_kwargs = {
            # Honour the caller's cap instead of always forcing 2048.
            "max_new_tokens": max_new_tokens if max_new_tokens else 2048,
            "pad_token_id": 128002,
        }

        if not model:
            self.tokenizer = AutoTokenizer.from_pretrained(
                model_path,
                trust_remote_code=True
            )
            self.model = AutoModelForCausalLM.from_pretrained(
                model_path,
                torch_dtype=self.torch_type,
                trust_remote_code=True
            ).eval().to(self.device)
        else:
            self.tokenizer = tokenizer
            self.model = model

        self.history = None
    def generate(self, messages, image=None, temperature=0., max_new_tokens=None, **kwargs):
        '''Generate the next message based on previous messages'''
        if image:
            image = Image.open(image).convert('RGB')
        history = []
        if len(messages) > 1:
            history_list = [m["content"]["text"] for m in messages[:-1]]
            # Group two messages at a time add them as a tuple to history
            history = list(zip(history_list[0::2], history_list[1::2]))
        if image is None:
            input_by_model = self.model.build_conversation_input_ids(
                self.tokenizer,
                query=messages[-1]["content"]["text"],
                history=history,
                template_version='chat'
            )
        else:
            input_by_model = self.model.build_conversation_input_ids(
                self.tokenizer,
                query=messages[-1]["content"]["text"],
                history=history,
                images=[image],
                template_version='chat'
            )
        inputs = {
            'input_ids': input_by_model['input_ids'].unsqueeze(0).to(self.device),
            'token_type_ids': input_by_model['token_type_ids'].unsqueeze(0).to(self.device),
            'attention_mask': input_by_model['attention_mask'].unsqueeze(0).to(self.device),
            'images': [[input_by_model['images'][0].to(self.device).to(self.torch_type)]] if image is not None else None,
        }
        with torch.no_grad():
            outputs = self.model.generate(**inputs, **self.gen_kwargs)
            outputs = outputs[:, inputs['input_ids'].shape[1]:]
            response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        return respons