In [65]:
from PIL import Image

import torch
from transformers import (
    AutoModelForImageClassification,
    AutoImageProcessor,
)
import numpy as np

In [2]:
# Checkpoint identifier handed to `from_pretrained` for both the classifier
# and its image processor; change it here to try another checkpoint.
MODEL_NAME = "p1atdev/siglip-tagger-test-2"

In [44]:
# Load the classifier weights in bfloat16 and put the module in eval mode.
# NOTE(review): trust_remote_code=True allows Python code shipped with the
# checkpoint repository to run during loading -- keep it only for sources
# you trust.
model = AutoModelForImageClassification.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
).eval()

# Matching preprocessing pipeline for the same checkpoint.
processor = AutoImageProcessor.from_pretrained(MODEL_NAME)

In [45]:
# Decode the sample picture inside a context manager so the file handle is
# released as soon as the pixels are read, and force 3-channel RGB so that
# greyscale, palette, CMYK or alpha-channel files reach the processor in a
# consistent layout.
with Image.open("sample.jpg") as image_file:
    image = image_file.convert("RGB")

# Preprocess into PyTorch tensors and move them to the model's device and
# dtype so they match the loaded weights.
inputs = processor(image, return_tensors="pt").to(model.device, model.dtype)

In [70]:
# Inference only: run the forward pass with autograd disabled so no
# computation graph is recorded for the activations.
with torch.no_grad():
    raw_logits = model(**inputs).logits

# Keep the first item of the batch, move it to the CPU and convert it to
# float32, then clamp every score into [0, 1]. The clamp is done with torch
# itself instead of passing a torch tensor through numpy's `np.clip`.
logits = raw_logits[0].cpu().float().clamp(0.0, 1.0)

In [80]:
# Map each label to its score, keeping only strictly positive scores, then
# order the (label, score) pairs from highest to lowest score. Built in a
# single expression so `results` is only ever bound to the sorted list.
results = sorted(
    {
        model.config.id2label[index]: value
        for index, value in enumerate(logits)
        if value > 0
    }.items(),
    key=lambda pair: pair[1],
    reverse=True,
)

In [81]:
# Print one "label: percentage" line per retained tag, in the order of
# `results`.
for tag, score in results:
    percent = score * 100
    print(f"{tag}: {percent:.2f}%")

1girl: 100.00%
outdoors: 100.00%
sky: 100.00%
solo: 100.00%
school uniform: 96.88%
skirt: 92.97%
day: 89.06%
cloud: 85.94%
scenery: 79.69%
pleated skirt: 72.27%
black hair: 66.80%
standing: 65.62%
sailor collar: 59.38%
sitting: 57.81%
long sleeves: 53.52%
serafuku: 53.12%
holding: 52.34%
tree: 47.46%
dress: 46.48%
shoes: 43.55%
building: 42.77%
neckerchief: 40.82%
short hair: 38.09%
water: 38.09%
cloudy sky: 37.30%
looking at viewer: 32.23%
long hair: 32.03%
brown eyes: 31.45%
plant: 31.05%
bag: 29.30%
railing: 29.10%
sunlight: 28.12%
from side: 27.73%
window: 27.54%
brown hair: 26.37%
white shirt: 25.78%
shirt: 25.39%
blue sky: 23.93%
hairclip: 23.44%
blunt bangs: 21.58%
picture frame: 19.34%
hand up: 18.26%
black skirt: 17.87%
smile: 17.87%
from behind: 13.57%
cowboy shot: 10.99%
indoors: 10.74%
curtains: 10.25%
facing away: 9.23%
white socks: 6.08%
bottle: 6.01%
mountain: 5.66%
blue skirt: 5.13%
drinking straw: 3.37%
kneehighs: 1.71%
