# Ollama model tag; used below both for `ollama pull` and for the chat client.
model_name = "gemma2:27b"
import os

# Best-effort bootstrap of the host: install lshw and then Ollama itself.
# * `-y` keeps apt from stopping at a confirmation prompt on a non-interactive host.
# * `-fsSL` makes curl follow redirects and fail on HTTP errors instead of
#   piping a redirect/error page into `sh`.
# A failing step is reported but does not abort the script, because some hosts
# have no sudo or already ship these tools.
for _install_cmd in (
    "sudo apt install -y lshw",
    "curl -fsSL https://ollama.ai/install.sh | sh",
):
    _install_status = os.system(_install_cmd)
    if _install_status != 0:
        print(f"warning: {_install_cmd!r} exited with status {_install_status}; continuing")
import nest_asyncio
# Patch asyncio so event loops can be used in a nested fashion.
nest_asyncio.apply()
import asyncio
# Run Async Ollama
# Taken from: https://stackoverflow.com/questions/77697302/how-to-run-ollama-in-google-colab
# NB: Depending on which backend you are running, you may need to set these variables to get CUDA working.
# Set environment variable for NVIDIA library
# Set environment variables for CUDA
# Append the CUDA binaries to PATH. Tolerates an unset PATH and does not add
# the directory a second time if this code is executed again.
_current_path = os.environ.get('PATH', '')
if '/usr/local/cuda/bin' not in _current_path.split(':'):
    os.environ['PATH'] = (
        _current_path + ':/usr/local/cuda/bin' if _current_path else '/usr/local/cuda/bin'
    )
# Set LD_LIBRARY_PATH to include both /usr/lib64-nvidia and CUDA lib directories.
# They are placed first so they take precedence; any entries that were already
# present are kept after them instead of being discarded.
_nvidia_lib_dirs = ['/usr/lib64-nvidia', '/usr/local/cuda/lib64']
_inherited_lib_dirs = [
    entry
    for entry in os.environ.get('LD_LIBRARY_PATH', '').split(':')
    if entry and entry not in _nvidia_lib_dirs
]
os.environ['LD_LIBRARY_PATH'] = ':'.join(_nvidia_lib_dirs + _inherited_lib_dirs)
async def run_process(cmd):
print('>>> starting', *cmd)
process = await asyncio.create_subprocess_exec(
*cmd,
stdout=asyncio.subprocess.PIPE,
stderr=asyncio.subprocess.PIPE
)
# define an async pipe function
async def pipe(lines):
async for line in lines:
print(line.decode().strip())
await asyncio.gather(
pipe(process.stdout),
pipe(process.stderr),
)
# call it
await asyncio.gather(pipe(process.stdout), pipe(process.stderr))
import threading
async def start_ollama_serve():
    """Run ``ollama serve`` through ``run_process`` and wait for it to return."""
    serve_command = ['ollama', 'serve']
    await run_process(serve_command)
def run_async_in_thread(loop, coro):
    """Run *coro* to completion on *loop*; written to be used as a thread target.

    Args:
        loop: An event loop that is not running; it is installed as the
            calling thread's current loop and closed before returning.
        coro: The coroutine (or awaitable) to drive on that loop.

    Any exception raised by *coro* propagates to the caller, but the loop is
    closed either way.
    """
    asyncio.set_event_loop(loop)
    try:
        loop.run_until_complete(coro)
    finally:
        # Close even when the coroutine fails so the loop's resources are
        # not leaked.
        loop.close()
# Create a new event loop that will run in a new thread
new_loop = asyncio.new_event_loop()
# Start ollama serve in a separate thread so the cell won't block execution
thread = threading.Thread(target=run_async_in_thread, args=(new_loop, start_ollama_serve()))
thread.start()

# `ollama pull` talks to the local server, which is still starting up in the
# background thread. Wait until the CLI can reach it (`ollama list` exits
# non-zero while the server is unreachable) rather than racing the startup.
# The wait is bounded; after the deadline the pull is attempted regardless.
import time

_server_deadline = time.monotonic() + 60
while os.system("ollama list > /dev/null 2>&1") != 0:
    if time.monotonic() >= _server_deadline:
        print("warning: ollama server not reachable after 60s; attempting pull anyway")
        break
    time.sleep(0.5)

# Load up model
os.system(f"ollama pull {model_name}")
import copy
import gradio as gr
import spaces
from llama_index.llms.ollama import Ollama
import llama_index
from llama_index.core.llms import ChatMessage
# Value of the HF_TOKEN environment variable, or None when it is not set.
HF_TOKEN = os.environ.get("HF_TOKEN")
# Model identifier; MODEL_NAME is the part after the last "/".
MODEL_ID = "google/gemma-2-27b-it"
MODEL_NAME = MODEL_ID.rsplit("/", 1)[-1]
# Export HF_HUB_ENABLE_HF_TRANSFER=1 into this process's environment.
os.environ.update({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
# Shared Ollama LLM client for `model_name`; stream_chat sends its requests
# through it. NOTE(review): request_timeout=30.0 is presumably in seconds —
# confirm it is long enough for this model's first load.
gemma2 = Ollama(model=model_name, request_timeout=30.0)
# Page heading rendered through gr.HTML. A plain "..." literal cannot span
# lines, so this is triple-quoted to keep the text exactly as written.
# NOTE(review): any surrounding HTML markup appears to have been lost —
# confirm whether a heading tag is intended here.
TITLE = """
Chatbox
"""
# Introductory text rendered through gr.HTML below the heading. It contains
# no placeholders, so it is a plain string rather than an f-string.
DESCRIPTION = """
Gemma is the large language model built by Google.
Feel free to test without log.
"""
# Stylesheet handed to gr.Blocks: styles the "duplicate" button and centres
# h3 headings.
CSS = """
.duplicate-button {
margin: auto !important;
color: white !important;
background: black !important;
border-radius: 100vh !important;
}
h3 {
text-align: center;
}
"""
@spaces.GPU(duration=90)
def stream_chat(message: str, history: list, temperature: float, context_window: int, top_p: float, top_k: int, penalty: float):
    """Stream the model's reply to *message* for the Gradio chat interface.

    Args:
        message: The user's new message.
        history: Earlier turns as ``(user_text, assistant_text)`` pairs.
        temperature: Sampling temperature chosen in the UI.
        context_window: Context window size chosen in the UI.
        top_p: Nucleus-sampling value chosen in the UI.
        top_k: Top-k value chosen in the UI.
        penalty: Repetition penalty chosen in the UI.

    Yields:
        The reply text accumulated so far. The chat interface replaces the
        displayed message on every yield, so each value must be the whole
        partial reply rather than just the newest fragment.
    """
    print(f'message is - {message}')
    print(f'history is - {history}')
    conversation = []
    for prompt, answer in history:
        conversation.extend([
            ChatMessage(role="user", content=prompt),
            ChatMessage(role="assistant", content=answer),
        ])
    print(f"Conversation is -\n{conversation}")
    # The LLM's stream_chat takes ONE positional sequence of messages: the
    # prior turns followed by the new user message. It has no `message` or
    # `chat_history` keywords (those belong to chat engines).
    messages = conversation + [ChatMessage(role="user", content=message)]
    # NOTE(review): whether per-call keyword options reach Ollama's sampling
    # options depends on the installed llama-index-llms-ollama version —
    # confirm, or configure them on the Ollama client instead.
    resp = gemma2.stream_chat(
        messages,
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        repeat_penalty=penalty,
        context_window=context_window,
    )
    partial_reply = ""
    for r in resp:
        # Chunks without new text (empty or None delta) change nothing.
        if r.delta:
            partial_reply += r.delta
            yield partial_reply
# Chat display component; handed to the ChatInterface below.
chatbot = gr.Chatbot(height=600)

with gr.Blocks(css=CSS, theme="soft") as demo:
    # Static header: title, description and the "duplicate" button.
    gr.HTML(TITLE)
    gr.HTML(DESCRIPTION)
    gr.DuplicateButton(value="Duplicate Space for private use", elem_classes="duplicate-button")

    # The controls below are created with render=False and passed to the
    # ChatInterface. Their order in `additional_inputs` matches the extra
    # positional parameters of stream_chat (temperature, context_window,
    # top_p, top_k, penalty).
    parameters_accordion = gr.Accordion(label="⚙️ Parameters", open=False, render=False)
    temperature_slider = gr.Slider(
        minimum=0,
        maximum=1,
        step=0.1,
        value=0.8,
        label="Temperature",
        render=False,
    )
    context_window_slider = gr.Slider(
        minimum=128,
        maximum=2048,
        step=1,
        value=1024,
        label="Context window",
        render=False,
    )
    top_p_slider = gr.Slider(
        minimum=0.0,
        maximum=1.0,
        step=0.1,
        value=0.8,
        label="top_p",
        render=False,
    )
    top_k_slider = gr.Slider(
        minimum=1,
        maximum=20,
        step=1,
        value=20,
        label="top_k",
        render=False,
    )
    penalty_slider = gr.Slider(
        minimum=0.0,
        maximum=2.0,
        step=0.1,
        value=1.0,
        label="Repetition penalty",
        render=False,
    )

    # Example prompts offered in the UI; each entry is a one-element list.
    example_prompts = [
        ["Help me study vocabulary: write a sentence for me to fill in the blank, and I'll try to pick the correct option."],
        ["What are 5 creative things I could do with my kids' art? I don't want to throw them away, but it's also so much clutter."],
        ["Tell me a random fun fact about the Roman Empire."],
        ["Show me a code snippet of a website's sticky header in CSS and JavaScript."],
    ]

    gr.ChatInterface(
        fn=stream_chat,
        chatbot=chatbot,
        fill_height=True,
        additional_inputs_accordion=parameters_accordion,
        additional_inputs=[
            temperature_slider,
            context_window_slider,
            top_p_slider,
            top_k_slider,
            penalty_slider,
        ],
        examples=example_prompts,
        cache_examples=False,
    )

# Launch the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()