import gradio as gr
import os
from loguru import logger
from langchain_community.llms import LlamaCpp
from langchain_core.callbacks import CallbackManager, StreamingStdOutCallbackHandler
from langchain_core.prompts import PromptTemplate
import spaces

# Create a directory for logs if it doesn't exist
if not os.path.exists('logs'):
    os.makedirs('logs')

# Define the log file path
log_file = 'logs/file_{time}.log'

# Configure the logger to write to the log file
logger.add(log_file, rotation="500 MB")

template = """Question: {question}

Answer: Let's work this out in a step by step way to be sure we have the right answer."""

prompt = PromptTemplate.from_template(template)

# Callbacks support token-wise streaming
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])

# n_gpu_layers = -1  # The number of layers to put on the GPU. The rest will be on the CPU. If you don't know how many layers there are, you can use -1 to move all to GPU.
# n_batch = 512  # Should be between 1 and n_ctx, consider the amount of VRAM in your GPU.

# Make sure the model path is correct for your system!
llm = LlamaCpp(
    model_path="/home/user/app/models/Phi-3-mini-4k-instruct-q4.gguf",
    callback_manager=callback_manager,
    verbose=True,  # Verbose is required to pass to the callback manager
)

llm_chain = prompt | llm


# Request a GPU for this function when running on a Hugging Face ZeroGPU Space
@spaces.GPU()
def greet(name):
    question = name
    response = llm_chain.invoke({"question": question})
    logger.info(f"Response --> {response}")
    return response


demo = gr.Interface(fn=greet, inputs="text", outputs="text")
demo.launch()