RaushanTurganbay and Xenova committed on
Commit
07f3bcb
1 Parent(s): 2c9ba3b

Add transformers.js tags and example code (#7)


- Add transformers.js tags and example code (097a868a7a83d9b1368b225b7635fa24f3ffc0d5)


Co-authored-by: Joshua <[email protected]>

Files changed (1)
  1. README.md +82 -0
README.md CHANGED
@@ -6,11 +6,13 @@ license: apache-2.0
 tags:
 - vision
 - image-text-to-text
+- transformers.js
 datasets:
 - lmms-lab/LLaVA-OneVision-Data
 pipeline_tag: image-text-to-text
 inference: false
 arxiv: 2408.03326
+library_name: transformers
 ---
 # LLaVA-Onevision Model Card
 
@@ -156,6 +158,86 @@ model = LlavaOnevisionForConditionalGeneration.from_pretrained(
 ).to(0)
 ```
 
+
+### Usage w/ Transformers.js
+
+If you haven't already, you can install the [Transformers.js](https://huggingface.co/docs/transformers.js) JavaScript library from [NPM](https://www.npmjs.com/package/@huggingface/transformers) using:
+```bash
+npm i @huggingface/transformers
+```
+
+**Example:** Multi-round conversations w/ PKV caching
+```js
+import { AutoProcessor, AutoTokenizer, LlavaOnevisionForConditionalGeneration, RawImage } from '@huggingface/transformers';
+
+// Load tokenizer, processor and model
+const model_id = 'llava-hf/llava-onevision-qwen2-0.5b-ov-hf';
+
+const tokenizer = await AutoTokenizer.from_pretrained(model_id);
+const processor = await AutoProcessor.from_pretrained(model_id);
+const model = await LlavaOnevisionForConditionalGeneration.from_pretrained(model_id, {
+    dtype: {
+        embed_tokens: 'fp16', // or 'fp32' or 'q8'
+        vision_encoder: 'fp16', // or 'fp32' or 'q8'
+        decoder_model_merged: 'q4', // or 'q8'
+    },
+    // device: 'webgpu',
+});
+
+// Prepare text inputs
+const prompt = 'What does the text say?';
+const messages = [
+    { role: 'system', content: 'Answer the question.' },
+    { role: 'user', content: `<image>\n${prompt}` },
+];
+const text = tokenizer.apply_chat_template(messages, { tokenize: false, add_generation_prompt: true });
+const text_inputs = tokenizer(text);
+
+// Prepare vision inputs
+const url = 'https://huggingface.co/qnguyen3/nanoLLaVA/resolve/main/example_1.png';
+const image = await RawImage.fromURL(url);
+const vision_inputs = await processor(image);
+
+// Generate response
+const { past_key_values, sequences } = await model.generate({
+    ...text_inputs,
+    ...vision_inputs,
+    do_sample: false,
+    max_new_tokens: 64,
+    return_dict_in_generate: true,
+});
+
+// Decode output
+const answer = tokenizer.decode(
+    sequences.slice(0, [text_inputs.input_ids.dims[1], null]),
+    { skip_special_tokens: true },
+);
+console.log(answer);
+// The text says "small but mighty" in a playful font.
+
+const new_messages = [
+    ...messages,
+    { role: 'assistant', content: answer },
+    { role: 'user', content: 'How does the text correlate to the context of the image?' },
+];
+const new_text = tokenizer.apply_chat_template(new_messages, { tokenize: false, add_generation_prompt: true });
+const new_text_inputs = tokenizer(new_text);
+
+// Generate another response
+const output = await model.generate({
+    ...new_text_inputs,
+    past_key_values,
+    do_sample: false,
+    max_new_tokens: 256,
+});
+const new_answer = tokenizer.decode(
+    output.slice(0, [new_text_inputs.input_ids.dims[1], null]),
+    { skip_special_tokens: true },
+);
+console.log(new_answer);
+// The text "small but mighty" is likely a playful or humorous reference to the image of the blue mouse with the orange dumbbell. It could be used as a motivational phrase or a playful way to express the idea that even small things can be impressive or powerful.
+```
+
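Transformers.js also exposes a `TextStreamer`, which is useful when you want tokens printed as they are generated instead of waiting for the full response. The snippet below is a minimal sketch of how the first `generate` call above could stream its output, reusing the `tokenizer`, `model`, `text_inputs`, and `vision_inputs` defined in the example; it assumes the `TextStreamer` export and `streamer` generation option available in Transformers.js v3.

```js
import { TextStreamer } from '@huggingface/transformers';

// Print tokens as soon as they are generated, skipping the prompt
// and special tokens in the streamed text.
const streamer = new TextStreamer(tokenizer, {
    skip_prompt: true,
    skip_special_tokens: true,
});

// Same call as in the example above; `streamer` only adds incremental
// printing and does not change the returned `sequences` or `past_key_values`.
const { past_key_values, sequences } = await model.generate({
    ...text_inputs,
    ...vision_inputs,
    do_sample: false,
    max_new_tokens: 64,
    return_dict_in_generate: true,
    streamer,
});
```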
 # Citation
 ```
 @misc{li2024llavaonevisioneasyvisualtask,