Add API setup instructions; remove stray and deprecated Engine and Agent code

Esse commit está contido em:
Saaket Agashe
2024-10-17 11:02:31 -07:00
commit 62d012447f
5 arquivos alterados com 33 adições e 341 exclusões
externo
BIN
Ver Arquivo
Arquivo binário não exibido.
+2 -1
Ver Arquivo
@@ -160,4 +160,5 @@ cython_debug/
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
logs/
logs/
.DS_Store
+20
Ver Arquivo
@@ -48,6 +48,26 @@ Install the agent_s package and dependencies
pip install -e .
```
Set your LLM API keys and other environment variables by adding the following lines to your `.bashrc` (Linux) or `.zshrc` (macOS) file. We support OpenAI, Azure OpenAI, Anthropic, and vLLM models.
1. OpenAI
```
export OPENAI_API_KEY=<YOUR_API_KEY>
```
2. Anthropic
```
export ANTHROPIC_API_KEY=<YOUR_API_KEY>
```
3. OpenAI on Azure
```
export AZURE_OPENAI_API_BASE=<DEPLOYMENT_NAME>
export AZURE_OPENAI_API_KEY=<YOUR_API_KEY>
```
4. vLLM for Local Models
```
export vLLM_ENDPOINT_URL=<YOUR_DEPLOYMENT_URL>
```
### Setup Retrieval from Web using Perplexica
1. Ensure Docker is installed and running on your system.
+11 -153
Ver Arquivo
@@ -5,24 +5,12 @@
from agent_s.MultimodalEngine import (
LMMEngineOpenAI,
LMMEngineAzureOpenAI,
LMMEngineLlava,
LMMEngineCogVLM,
LMMEnginevLLM,
LMMEngineAnthropic,
LMMEngineQwen,
)
import base64
import re
# TODO: Import only if module exists, else ignore
# from llava.constants import (
# IMAGE_TOKEN_INDEX,
# DEFAULT_IMAGE_TOKEN,
# DEFAULT_IM_START_TOKEN,
# DEFAULT_IM_END_TOKEN,
# IMAGE_PLACEHOLDER,
# )
data_type_map = {
"openai": {"image_url": "image_url"},
"anthropic": {"image_url": "image"},
@@ -42,12 +30,6 @@ class LMMAgent:
self.engine = LMMEngineAzureOpenAI(**engine_params)
elif engine_type == "vllm":
self.engine = LMMEnginevLLM(**engine_params)
elif engine_type == "qwen":
self.engine = LMMEngineQwen(**engine_params)
elif engine_type == "llava":
self.engine = LMMEngineLlava(**engine_params)
elif engine_type == "cogvlm":
self.engine = LMMEngineCogVLM(**engine_params)
else:
raise ValueError("engine_type must be either 'openai' or 'azure'")
else:
@@ -73,15 +55,13 @@ class LMMAgent:
def reset(
self,
):
if isinstance(self.engine, (LMMEngineCogVLM, LMMEngineLlava)):
self.messages = []
else:
self.messages = [
{
"role": "system",
"content": [{"type": "text", "text": self.system_prompt}],
}
]
self.messages = [
{
"role": "system",
"content": [{"type": "text", "text": self.system_prompt}],
}
]
def add_system_prompt(self, system_prompt):
self.system_prompt = system_prompt
@@ -98,12 +78,6 @@ class LMMAgent:
}
)
# Don't add the system prompt if we are using llava or other hf models
if isinstance(self.engine, LMMEngineLlava) or isinstance(
self.engine, LMMEngineCogVLM
):
self.messages = []
def remove_message_at(self, index):
"""Remove a message at a given index"""
if index < len(self.messages):
@@ -135,80 +109,8 @@ class LMMAgent:
):
"""Add a new message to the list of messages"""
# For inference from locally hosted llava based on https://github.com/haotian-liu/LLaVA/
if isinstance(self.engine, LMMEngineLlava):
# No system prompt so first message will be from user
if len(self.messages) == 0:
role = "user"
else:
# infer role from previous message
if self.messages[-1]["role"] == "user":
role = "assistant"
elif self.messages[-1]["role"] == "assistant":
role = "user"
image_token_se = (
DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN
)
qs = text_content
if role == "user":
if len(self.messages) == 0:
# If this is the very first user message, add the system prompt to it to dictate behavior
qs = self.system_prompt + "\n" + qs
# TODO: Add comment explaining what this next part does
if IMAGE_PLACEHOLDER in qs:
if self.engine.model.config.mm_use_im_start_end:
qs = re.sub(IMAGE_PLACEHOLDER, image_token_se, qs)
else:
qs = re.sub(IMAGE_PLACEHOLDER, DEFAULT_IMAGE_TOKEN, qs)
else:
if self.engine.model.config.mm_use_im_start_end:
qs = image_token_se + "\n" + qs
else:
qs = DEFAULT_IMAGE_TOKEN + "\n" + qs
message = {"role": role, "content": qs}
else:
message = {"role": role, "content": text_content}
# Capable of handling only one image right now. TODO: make capable of handling more images
if image_content:
if self.engine.args.image_file == None:
self.engine.args.image_file = image_content
self.messages.append(message)
elif isinstance(self.engine, LMMEngineCogVLM):
# No system prompt so first message will be from user
if len(self.messages) == 0:
role = "user"
else:
# infer role from previous message
if self.messages[-1]["role"] == "user":
role = "assistant"
elif self.messages[-1]["role"] == "assistant":
role = "user"
# Add message content as a new message, if this is the first message prepend with system prompt
if len(self.messages) == 0:
self.messages.append(
{
"role": role,
"content": {
"type": "text",
"text": self.system_prompt + "\n\n" + text_content,
},
}
)
else:
self.messages.append(
{"role": role, "content": {"type": "text", "text": text_content}}
)
# For API-style inference from OpenAI and AzureOpenAI
elif isinstance(self.engine, (LMMEngineOpenAI, LMMEngineAzureOpenAI)):
# API-style inference from OpenAI and AzureOpenAI
if isinstance(self.engine, (LMMEngineOpenAI, LMMEngineAzureOpenAI)):
# infer role from previous message
if role != "user":
if self.messages[-1]["role"] == "system":
@@ -299,8 +201,8 @@ class LMMAgent:
)
self.messages.append(message)
# Custom Qwen Model inference
elif isinstance(self.engine, LMMEngineQwen):
# Locally hosted vLLM model inference
elif isinstance(self.engine, LMMEnginevLLM):
# infer role from previous message
if role != "user":
if self.messages[-1]["role"] == "system":
@@ -338,50 +240,6 @@ class LMMAgent:
)
self.messages.append(message)
# Custom Llama3.2 Model inference
elif isinstance(self.engine, LMMEngineTogether):
# infer role from previous message
if role != "user":
if self.messages[-1]["role"] == "system":
role = "user"
elif self.messages[-1]["role"] == "user":
role = "assistant"
elif self.messages[-1]["role"] == "assistant":
role = "user"
message = {
"role": role,
"content": [{"type": "text", "text": text_content}],
}
if image_content:
# Check if image_content is a list or a single image
if isinstance(image_content, list):
# If image_content is a list of images, loop through each image
for image in image_content:
base64_image = self.encode_image(image)
message["content"].append(
{
"type": "image_url",
"image_url": {
"url": f"data:image/png;base64,{base64_image}",
},
}
)
else:
# If image_content is a single image, handle it directly
base64_image = self.encode_image(image_content)
message["content"].append(
{
"type": "image_url",
"image_url": {
"url": f"data:image/png;base64,{base64_image}",
},
}
)
self.messages.append(message)
def get_response(
self,
user_message=None,
-187
Ver Arquivo
@@ -122,33 +122,6 @@ class LMMEngineAnthropic(LMMEngine):
**kwargs,
).content[0].text
class LMMEngineQwen(LMMEngine):
    """Engine that forwards chat messages to a custom Qwen HTTP endpoint.

    The request body is ``{"messages": messages}``; the reply is read as JSON
    and the first element of its ``"response"`` entry is returned.
    """

    def __init__(self, base_url=None, api_key=None, model=None, rate_limit=-1, **kwargs):
        """Store the connection settings for the endpoint.

        Args:
            base_url: URL of the endpoint. Falls back to the
                ``QWEN_ENDPOINT_URL`` environment variable when falsy.
            api_key: Stored on the instance; not sent by ``generate``.
            model: Stored on the instance; not sent by ``generate``.
            rate_limit: Accepted but unused by this engine.
            **kwargs: Accepted and ignored.

        Raises:
            ValueError: If no endpoint URL is given and the environment
                variable is not set.
        """
        self.model = model
        self.api_key = api_key
        self.base_url = base_url or os.getenv("QWEN_ENDPOINT_URL")
        if self.base_url is None:
            # The message names the parameter and variable this class really reads.
            raise ValueError(
                "An endpoint URL needs to be provided in either the base_url "
                "parameter or as an environment variable named QWEN_ENDPOINT_URL"
            )

    def generate(self, messages, temperature=0.0, max_new_tokens=None, **kwargs):
        """Generate the next message based on previous messages.

        Args:
            messages: Conversation history, posted as-is under ``"messages"``.
            temperature: Accepted but not forwarded to the endpoint.
            max_new_tokens: Accepted but not forwarded to the endpoint.
            **kwargs: Accepted and ignored.

        Returns:
            The first element of the ``"response"`` list in the JSON reply, or
            ``None`` when the endpoint answers with a non-200 status code.
        """
        payload = {"messages": messages}
        response = requests.post(self.base_url, json=payload)

        if response.status_code == 200:
            return response.json()["response"][0]

        # Best-effort behaviour is kept: report the failure and return None.
        print(f"Qwen LLM generation failed with status code: {response.status_code}")
        print("Error message:", response.text)
        return None
class OpenAIEmbeddingEngine(LMMEngine):
def __init__(
self,
@@ -255,163 +228,3 @@ class LMMEnginevLLM(LMMEngine):
extra_body={"repetition_penalty": repetition_penalty},
)
return completion.choices[0].message.content
class LMMEngineLlava(LMMEngine):
    """Engine for a locally loaded LLaVA model.

    Generation settings are kept on ``self.args``, an ad-hoc namespace object
    that is also passed to ``image_parser``.
    """

    def __init__(
        self,
        model_path=None,
        model=None,
        tokenizer=None,
        image_processor=None,
        context_len=None,
        max_new_tokens=None,
        rate_limit=-1,
        **kwargs,
    ):
        """Load (or adopt) the model and pick a conversation template.

        Args:
            model_path: Path of the model; required.
            model: Pre-loaded model. When falsy, the tokenizer, model, image
                processor and context length are loaded from ``model_path``.
            tokenizer: Tokenizer to use together with a pre-loaded ``model``.
            image_processor: Image processor to use with a pre-loaded ``model``.
            context_len: Context length to record for a pre-loaded ``model``.
            max_new_tokens: Generation cap; 2048 when falsy.
            rate_limit: Requests per minute; ``-1`` gives a zero request interval.
            **kwargs: Accepted and ignored.
        """
        assert model_path is not None, "model path must be provided"
        self.model_path = model_path
        self.request_interval = 0 if rate_limit == -1 else 60.0 / rate_limit
        self.args = type(
            "Args",
            (),
            {
                "model_path": model_path,
                "model_base": None,
                "model_name": get_model_name_from_path(model_path),
                "query": None,
                "conv_mode": None,
                "image_file": None,
                "sep": ",",
                "temperature": 0.0,
                "top_p": 1,
                "num_beams": 1,
                "max_new_tokens": max_new_tokens if max_new_tokens else 2048,
            },
        )()

        if not model:
            (
                self.tokenizer,
                self.model,
                self.image_processor,
                self.context_len,
            ) = load_pretrained_model(model_path, None, self.args.model_name)
        else:
            self.tokenizer = tokenizer
            self.model = model
            self.image_processor = image_processor
            self.context_len = context_len

        # Check model base type for conversation template
        model_name = self.args.model_name.lower()
        if "llama-2" in model_name:
            self.args.conv_mode = "llava_llama_2"
        elif "v1" in model_name:
            self.args.conv_mode = "llava_v1"
        elif "mpt" in model_name:
            # Fixed: this used to set self.conv_mode, which left
            # self.args.conv_mode as None and broke the template lookup below.
            self.args.conv_mode = "mpt"
        else:
            self.args.conv_mode = "llava_v0"

        self.conversation = conv_templates[self.args.conv_mode].copy()

    def generate(self, messages, image=None, temperature=0.0, max_new_tokens=None, **kwargs):
        """Generate the next message based on previous messages.

        Args:
            messages: Sequence of dicts with a ``"content"`` entry. Roles are
                assigned by position, alternating between the template's two roles.
            image: Accepted but unused; the image comes from ``self.args.image_file``.
            temperature: Accepted but unused; ``self.args.temperature`` is used.
            max_new_tokens: Accepted but unused; ``self.args.max_new_tokens`` is used.
            **kwargs: Accepted and ignored.

        Returns:
            The first decoded output sequence, stripped of surrounding whitespace.
        """
        # Refresh the conversation holder every time
        self.conversation = conv_templates[self.args.conv_mode].copy()

        for idx, message in enumerate(messages):
            self.conversation.append_message(
                self.conversation.roles[idx % 2], message["content"]
            )

        # Add the "ASSISTANT:" starter before generation
        self.conversation.append_message(self.conversation.roles[1], None)
        prompt = self.conversation.get_prompt()

        self.args.image_files = [self.args.image_file]
        image_files = image_parser(self.args)
        images = load_images(image_files)
        image_sizes = [img.size for img in images]
        images_tensor = process_images(
            images,
            self.image_processor,
            self.model.config,
        ).to(self.model.device, dtype=torch.float16)

        input_ids = (
            tokenizer_image_token(
                prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt"
            )
            .unsqueeze(0)
            .cuda()
        )

        with torch.inference_mode():
            output_ids = self.model.generate(
                input_ids,
                images=images_tensor,
                image_sizes=image_sizes,
                do_sample=self.args.temperature > 0,
                temperature=self.args.temperature,
                top_p=self.args.top_p,
                num_beams=self.args.num_beams,
                max_new_tokens=self.args.max_new_tokens,
            )

        outputs = self.tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
        return outputs
class LMMEngineCogVLM(LMMEngine):
    """Engine for a locally loaded CogVLM model.

    Messages are expected to carry their text under ``message["content"]["text"]``.
    """

    def __init__(
        self,
        model_path=None,
        model=None,
        tokenizer=None,
        image_processor=None,
        context_len=None,
        max_new_tokens=None,
        device=None,
        rate_limit=-1,
        **kwargs,
    ):
        """Load (or adopt) the model and tokenizer and set generation defaults.

        Args:
            model_path: Path of the model; required.
            model: Pre-loaded model. When falsy, both tokenizer and model are
                loaded from ``model_path`` with ``trust_remote_code=True``.
            tokenizer: Tokenizer to use together with a pre-loaded ``model``.
            image_processor: Accepted but unused by this engine.
            context_len: Accepted but unused by this engine.
            max_new_tokens: Generation cap; 2048 when falsy.
            device: Target device; when falsy, ``'cuda'`` if available else ``'cpu'``.
            rate_limit: Requests per minute; ``-1`` gives a zero request interval.
            **kwargs: Accepted and ignored.
        """
        assert model_path is not None, "model path must be provided"
        self.model_path = model_path
        self.request_interval = 0 if rate_limit == -1 else 60.0 / rate_limit

        if device:
            self.device = device
        else:
            self.device = "cuda" if torch.cuda.is_available() else "cpu"

        # bfloat16 only when CUDA is available and the device capability major is >= 8.
        self.torch_type = (
            torch.bfloat16
            if torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8
            else torch.float16
        )

        self.gen_kwargs = {
            # Honour the caller's cap, as the Llava engine does; default unchanged.
            "max_new_tokens": max_new_tokens if max_new_tokens else 2048,
            "pad_token_id": 128002,
        }

        if not model:
            self.tokenizer = AutoTokenizer.from_pretrained(
                model_path,
                trust_remote_code=True,
            )
            self.model = (
                AutoModelForCausalLM.from_pretrained(
                    model_path,
                    torch_dtype=self.torch_type,
                    trust_remote_code=True,
                )
                .eval()
                .to(self.device)
            )
        else:
            self.tokenizer = tokenizer
            self.model = model

        self.history = None

    def generate(self, messages, image=None, temperature=0.0, max_new_tokens=None, **kwargs):
        """Generate the next message based on previous messages.

        Args:
            messages: Sequence of dicts whose ``["content"]["text"]`` holds the
                text. The last entry is the query; earlier entries are paired
                two at a time to form the history.
            image: Something ``Image.open`` accepts, or a falsy value for
                text-only generation.
            temperature: Accepted but unused.
            max_new_tokens: Accepted but unused; the cap comes from ``self.gen_kwargs``.
            **kwargs: Accepted and ignored.

        Returns:
            The decoded text of the newly generated tokens.
        """
        # Normalise every falsy value (None, "") to None so the branches below
        # agree on whether an image is present.
        image = Image.open(image).convert("RGB") if image else None

        history = []
        if len(messages) > 1:
            history_list = [m["content"]["text"] for m in messages[:-1]]
            # Group two messages at a time and add them as a tuple to history
            history = list(zip(history_list[0::2], history_list[1::2]))

        build_kwargs = {
            "query": messages[-1]["content"]["text"],
            "history": history,
            "template_version": "chat",
        }
        if image is not None:
            build_kwargs["images"] = [image]
        input_by_model = self.model.build_conversation_input_ids(
            self.tokenizer, **build_kwargs
        )

        inputs = {
            "input_ids": input_by_model["input_ids"].unsqueeze(0).to(self.device),
            "token_type_ids": input_by_model["token_type_ids"].unsqueeze(0).to(self.device),
            "attention_mask": input_by_model["attention_mask"].unsqueeze(0).to(self.device),
            "images": (
                [[input_by_model["images"][0].to(self.device).to(self.torch_type)]]
                if image is not None
                else None
            ),
        }

        with torch.no_grad():
            outputs = self.model.generate(**inputs, **self.gen_kwargs)
            # Drop the prompt tokens so only the continuation is decoded.
            outputs = outputs[:, inputs["input_ids"].shape[1]:]
            response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Fixed: the original returned the undefined name `respons`.
        return response