TheVixhal committed on
Commit
7864b9c
1 Parent(s): ae73da3

Create app.py

Files changed (1)
  1. app.py +55 -0
app.py ADDED
@@ -0,0 +1,55 @@
import gradio as gr
from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig
from PIL import Image

# Load the processor
processor = AutoProcessor.from_pretrained(
    'allenai/Molmo-7B-D-0924',
    trust_remote_code=True,
    torch_dtype='auto',
    device_map='auto'
)

# Load the model
model = AutoModelForCausalLM.from_pretrained(
    'allenai/Molmo-7B-D-0924',
    trust_remote_code=True,
    torch_dtype='auto',
    device_map='auto'
)
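
# Note: trust_remote_code=True executes custom modeling code shipped with the
# Molmo repository, so only enable it for repositories you trust. With
# device_map='auto', accelerate places the weights automatically; in practice
# a GPU with enough memory for a 7B model is assumed.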

def describe_image(image: Image.Image) -> str:
    # Process the image together with the verification prompt
    inputs = processor.process(
        images=[image],
        text=(
            "Check the image against these conditions: a person sitting "
            "properly, with a laptop/PC clearly visible and the student's "
            "face at least 40%-50% visible; the student looking at the "
            "laptop screen with both hands on the keyboard; no accessories "
            "other than the laptop/PC; and no second person present. "
            "If all conditions are satisfied answer YES, otherwise NO. "
            "Answer only YES or NO."
        )
    )

    # Move inputs to the model's device and make a batch of size 1
    inputs = {k: v.to(model.device).unsqueeze(0) for k, v in inputs.items()}

    # Generate at most 200 new tokens; stop when <|endoftext|> is produced.
    # generate_from_batch comes from Molmo's remote code, not the standard
    # transformers generate API.
    output = model.generate_from_batch(
        inputs,
        GenerationConfig(max_new_tokens=200, stop_strings="<|endoftext|>"),
        tokenizer=processor.tokenizer
    )

    # Keep only the newly generated tokens and decode them to text
    generated_tokens = output[0, inputs['input_ids'].size(1):]
    generated_text = processor.tokenizer.decode(generated_tokens, skip_special_tokens=True)

    return generated_text
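
# Optional post-processing sketch (hypothetical, not part of the original
# commit): the prompt asks for YES or NO only, but generation can still drift,
# so a caller may want to normalize the raw text to a strict verdict.
def to_verdict(text: str) -> str:
    t = text.strip().upper()
    return "YES" if t.startswith("YES") else "NO"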

# Create the Gradio interface
iface = gr.Interface(
    fn=describe_image,
    inputs=gr.Image(type="pil", label="Upload an Image"),
    outputs=gr.Textbox(label="Description"),
    title="OPPE",
    description="OPPE VERIFICATION."
)

# Launch the interface
iface.launch()
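
Once the app is running, the endpoint can also be exercised programmatically. A minimal sketch, assuming gradio_client 1.x is installed and the app is serving at Gradio's default local URL; the file name sample.jpg is a placeholder:

from gradio_client import Client, handle_file

client = Client("http://127.0.0.1:7860/")  # default URL printed by iface.launch()
result = client.predict(
    handle_file("sample.jpg"),             # placeholder path to a local image
    api_name="/predict"                    # gr.Interface's default endpoint name
)
print(result)                              # expected: "YES" or "NO"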