Spaces:
Running
Running
import asyncio | |
import json | |
import os | |
import aiohttp | |
import gradio as gr | |
import numpy as np | |
import spaces | |
from huggingface_hub import InferenceClient | |
import random | |
import torch | |
from huggingface_hub import AsyncInferenceClient | |
from transformers import LlamaTokenizer, LlamaForCausalLM, AutoTokenizer | |
async def query_llm(payload, model_name): | |
headers = {"Authorization": f"Bearer {os.getenv('HF_TOKEN')}"} | |
async with aiohttp.ClientSession() as session: | |
async with session.post(f"https://api-inference.huggingface.co/models/{model_name}", headers=headers, | |
json=payload) as response: | |
return await response.json() | |
async def generate_mistral_7bvo1(system_input, user_input): | |
client = AsyncInferenceClient( | |
"mistralai/Mistral-7B-Instruct-v0.1", | |
token=os.getenv('HF_TOKEN'), | |
) | |
async for message in await client.chat_completion( | |
messages=[ | |
{"role": "system", "content": system_input}, | |
{"role": "user", "content": user_input}, ], | |
max_tokens=256, | |
stream=True, | |
): | |
yield message.choices[0].delta.content | |
async def generate_gpt2(system_input, user_input): | |
output = await query_llm({ | |
"inputs": (inputs:=f"{system_input}\n{user_input}"), | |
}, "openai-community/gpt2") | |
yield output[0]["generated_text"] | |
async def generate_llama2(system_input, user_input): | |
client = AsyncInferenceClient( | |
"meta-llama/Llama-2-7b-chat-hf", | |
token=os.getenv('HF_TOKEN') | |
) | |
async for message in await client.chat_completion( | |
messages=[ | |
{"role": "system", "content": system_input}, | |
{"role": "user", "content": user_input}, ], | |
max_tokens=256, | |
stream=True, | |
): | |
yield message.choices[0].delta.content | |
def generate_openllama(system_input, user_input): | |
model_path = 'openlm-research/open_llama_3b_v2' | |
tokenizer = LlamaTokenizer.from_pretrained(model_path) | |
model = LlamaForCausalLM.from_pretrained( | |
model_path, torch_dtype=torch.float16, device_map='cuda', | |
) | |
print('model openllama loaded') | |
input_text = f"{system_input}\n{user_input}" | |
input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to("cuda") | |
output = model.generate(input_ids, max_length=128) | |
return tokenizer.decode(output[0], skip_special_tokens=True) | |
def generate_bloom(system_input, user_input): | |
model_path = 'bigscience/bloom-7b1' | |
tokenizer = AutoTokenizer.from_pretrained(model_path) | |
model = LlamaForCausalLM.from_pretrained( | |
model_path, torch_dtype=torch.float16, device_map='cuda', | |
) | |
input_text = f"{system_input}\n{user_input}" | |
input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to("cuda") | |
output = model.generate(input_ids, max_length=128) | |
return tokenizer.decode(output[0], skip_special_tokens=True) | |
async def generate_llama3(system_input, user_input): | |
client = AsyncInferenceClient( | |
"meta-llama/Meta-Llama-3.1-8B-Instruct", | |
token=os.getenv('HF_TOKEN') | |
) | |
try: | |
async for message in await client.chat_completion( | |
messages=[ | |
{"role": "system", "content": system_input}, | |
{"role": "user", "content": user_input}, ], | |
max_tokens=256, | |
stream=True, | |
): | |
yield message.choices[0].delta.content | |
except json.JSONDecodeError: | |
pass | |