import os
import subprocess
import sys

# Force CPU-only execution. CUDA_VISIBLE_DEVICES must be set before torch is
# imported; the original TRANSFORMERS_NO_CUDA variable is not read by transformers.
os.environ["CUDA_VISIBLE_DEVICES"] = ""

# Install the required packages (using the current interpreter's pip)
subprocess.run([sys.executable, "-m", "pip", "install", "--upgrade", "pip"], check=True)
subprocess.run([sys.executable, "-m", "pip", "install", "--upgrade", "torch", "transformers", "accelerate"], check=True)
subprocess.run([sys.executable, "-m", "pip", "install", "git+https://github.com/TimDettmers/bitsandbytes.git"], check=True)

import accelerate  # noqa: F401 -- required by transformers for device placement
import bitsandbytes  # noqa: F401 -- required for the 4-bit quantized weights
import gradio as gr
from transformers import LlamaForCausalLM, LlamaTokenizer

# Load the model and tokenizer
model = LlamaForCausalLM.from_pretrained(
    "Ramikan-BR/tinyllama_PY-CODER-bnb-4bit-lora_4k-q4_k_m-v2"
)
tokenizer = LlamaTokenizer.from_pretrained(
    "Ramikan-BR/tinyllama_PY-CODER-bnb-4bit-lora_4k-q4_k_m-v2"
)


def predict(input_text):
    # Encode the input text and generate the output
    input_ids = tokenizer.encode(input_text, return_tensors="pt")
    output = model.generate(
        input_ids,
        max_length=4096,
        do_sample=True,
        top_k=50,
        top_p=0.50,
        num_return_sequences=1,
    )
    return tokenizer.decode(output[0], skip_special_tokens=True)


# Create the Gradio interface
iface = gr.Interface(fn=predict, inputs="text", outputs="text")
iface.launch()
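
# --- Optional usage sketch (run in a separate process/terminal) -------------
# iface.launch() blocks, so the app has to be queried from elsewhere. This is
# a minimal sketch, assuming Gradio's default local URL (http://127.0.0.1:7860)
# and the "/predict" endpoint name that Gradio auto-generates for a
# single-function Interface; the prompt below is a hypothetical example, not
# part of the original script.
#
#     from gradio_client import Client
#
#     client = Client("http://127.0.0.1:7860/")
#     result = client.predict(
#         "Write a Python function that reverses a string.",
#         api_name="/predict",
#     )
#     print(result)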