# Colab TPU

!pip install torch_xla

import torch
import torch_xla
import torch_xla.core.xla_model as xm
from transformers import pipeline

# Set the device to the TPU.
# xm.xla_device() returns the torch.device of the XLA backend for this
# process; everything below targets that device.
device = xm.xla_device()
print(f"Using device: {device}")

# Load the text-generation pipeline onto the TPU with KV caching enabled.
# No max_length is set here: the generation call bounds the output with
# max_new_tokens, and setting both makes transformers warn about the conflict.
pipe = pipeline(
    "text-generation",
    model="unsloth/Qwen2.5-32B-Instruct",
    # NOTE(review): this allows code shipped in the model repository to be
    # executed locally; confirm it is actually required for this model.
    trust_remote_code=True,
    device=device,  # -1 would select the CPU; pass the XLA device to use the TPU
    use_cache=True,  # reuse the key/value cache between decoding steps
    torch_dtype=torch.bfloat16,  # bfloat16 is the 16-bit float format TPUs handle natively
)

# Verify the device.
# Inspect where the model's parameters actually live instead of the device
# that was requested, so the report cannot claim TPU when loading fell back
# to another device.
model_device = pipe.model.device
if model_device.type == "xla":
    print(f"Model loaded on TPU: {model_device}")
else:
    print(f"Model loaded on CPU: {model_device}")

# The chat message to send to the model.
# Each message carries only "role" and "content"; cache_position is an
# internal generation argument, not a message field, so it is not set here.
messages = [
    {"role": "user", "content": "Who are you?"},
]

# Generate the reply, bounding its length with max_new_tokens rather than
# max_length.
try:
    # Only the pipeline call is guarded; at most 50 new tokens are generated.
    output = pipe(messages, max_new_tokens=50)
except RuntimeError as e:
    # The message below reads "An error occurred: ...".
    print(f"حدث خطأ: {e}")
else:
    print(output)

# Model: unsloth/Qwen2.5-32B-Instruct, run on TPU

# Outline of the steps:
# 1. Set the device to the TPU.
# 2. Load the pipeline with caching enabled and cache_position in use.
# 3. Verify the device.
# 4. Build the message to send to the model, with cache_position added.
# 5. Generate the reply using max_new_tokens instead of max_length.