File size: 3,047 Bytes
b461971 6ca20df b28fc4b b461971 2c6047b b461971 2c6047b b461971 455c955 2c6047b b461971 2c6047b b461971 2c6047b b461971 2c6047b a94341a b461971 030f3b8 0e71cb1 030f3b8 2c6047b b461971 2c6047b 4b6516d 2c6047b 5318b89 b461971 2c6047b b461971 2c6047b b8a909b b461971 2c6047b b461971 2c6047b b461971 2c6047b b461971 2c6047b b461971 2c6047b c1fe742 2c6047b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 |
---
library_name: transformers
tags:
- llama-factory
- yi-vl
- llava
license: other
language:
- zh
- en
pipeline_tag: visual-question-answering
---
This is the Hugging Face Transformers version of the [Yi-VL-6B](https://huggingface.co/01-ai/Yi-VL-6B) model.
You may use this model for fine-tuning on downstream tasks; we recommend using our efficient fine-tuning toolkit, [LLaMA-Factory](https://github.com/hiyouga/LLaMA-Factory).
- **Developed by:** [01-AI](https://www.01.ai/).
- **Language(s) (NLP):** Chinese/English
- **License:** [Yi Series Model License](https://huggingface.co/01-ai/Yi-VL-34B/blob/main/LICENSE)
Usage:
```python
import requests
from PIL import Image
import torch
from transformers import AutoProcessor, AutoModelForVision2Seq, LlavaConfig
import transformers
from torch import nn
class LlavaMultiModalProjectorYiVL(nn.Module):
def __init__(self, config: "LlavaConfig"):
super().__init__()
self.linear_1 = nn.Linear(config.vision_config.hidden_size, config.text_config.hidden_size, bias=True)
self.linear_2 = nn.LayerNorm(config.text_config.hidden_size, bias=True)
self.linear_3 = nn.Linear(config.text_config.hidden_size, config.text_config.hidden_size, bias=True)
self.linear_4 = nn.LayerNorm(config.text_config.hidden_size, bias=True)
self.act = nn.GELU()
def forward(self, image_features):
hidden_states = self.linear_1(image_features)
hidden_states = self.linear_2(hidden_states)
hidden_states = self.act(hidden_states)
hidden_states = self.linear_3(hidden_states)
hidden_states = self.linear_4(hidden_states)
return hidden_states
# Monkey patch of LlavaMultiModalProjector is mandatory. It is applied here,
# before from_pretrained() below, so the replacement class is already in place
# when the model is built.
transformers.models.llava.modeling_llava.LlavaMultiModalProjector = LlavaMultiModalProjectorYiVL

model_id = "BUAADreamer/Yi-VL-6B-hf"
messages = [
    {"role": "user", "content": "<image>What's in the picture?"}
]
image_file = "http://images.cocodataset.org/val2017/000000039769.jpg"

# Load the model in half precision and move it to device 0.
model = AutoModelForVision2Seq.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
).to(0)
processor = AutoProcessor.from_pretrained(model_id)

# Render the conversation to a prompt string and fetch the example image.
text = [processor.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False)]
images = [Image.open(requests.get(image_file, stream=True).raw)]
inputs = processor(text=text, images=images, return_tensors='pt').to(0, torch.float16)

output = model.generate(**inputs, max_new_tokens=200)
# batch_decode returns a list with one string per sequence, so index the
# single sample before calling str.split (a list has no .split method).
output = processor.batch_decode(output, skip_special_tokens=True)
print(output[0].split("Assistant:")[-1].strip())
```
Alternatively, you can launch a web demo with the [LLaMA-Factory](https://github.com/hiyouga/LLaMA-Factory) CLI:
```bash
llamafactory-cli webchat \
--model_name_or_path BUAADreamer/Yi-VL-6B-hf \
--template yivl \
--visual_inputs
```
# [lmms-eval Evaluation Results](https://github.com/EvolvingLMMs-Lab/lmms-eval)
| Metric |Value|
|---------------------------------|----:|
| MMMU_val |36.8|
| CMMMU_val |32.2|