Spaces:
Runtime error
Runtime error
from transformers import BlipProcessor, BlipForConditionalGeneration, DetrImageProcessor, DetrForObjectDetection | |
from PIL import Image | |
import torch | |
def get_image_caption(image_path): | |
""" | |
Generates a short caption for the provided image. | |
Args: | |
image_path (str): The path to the image file. | |
Returns: | |
str: A string representing the caption for the image. | |
""" | |
image = Image.open(image_path).convert('RGB') | |
model_name = "Salesforce/blip-image-captioning-large" | |
device = "cpu" # cuda | |
processor = BlipProcessor.from_pretrained(model_name) | |
model = BlipForConditionalGeneration.from_pretrained(model_name).to(device) | |
inputs = processor(image, return_tensors='pt').to(device) | |
output = model.generate(**inputs, max_new_tokens=20) | |
caption = processor.decode(output[0], skip_special_tokens=True) | |
return caption | |
def detect_objects(image_path): | |
""" | |
Detects objects in the provided image. | |
Args: | |
image_path (str): The path to the image file. | |
Returns: | |
str: A string with all the detected objects. Each object as '[x1, x2, y1, y2, class_name, confindence_score]'. | |
""" | |
image = Image.open(image_path).convert('RGB') | |
processor = DetrImageProcessor.from_pretrained("facebook/detr-resnet-50") | |
model = DetrForObjectDetection.from_pretrained("facebook/detr-resnet-50") | |
inputs = processor(images=image, return_tensors="pt") | |
outputs = model(**inputs) | |
# convert outputs (bounding boxes and class logits) to COCO API | |
# let's only keep detections with score > 0.9 | |
target_sizes = torch.tensor([image.size[::-1]]) | |
results = processor.post_process_object_detection(outputs, target_sizes=target_sizes, threshold=0.9)[0] | |
detections = "" | |
for score, label, box in zip(results["scores"], results["labels"], results["boxes"]): | |
detections += '[{}, {}, {}, {}]'.format(int(box[0]), int(box[1]), int(box[2]), int(box[3])) | |
detections += ' {}'.format(model.config.id2label[int(label)]) | |
detections += ' {}\n'.format(float(score)) | |
return detections | |
if __name__ == '__main__': | |
image_path = '/home/phillip/Desktop/todays_tutorial/52_langchain_ask_questions_video/code/test.jpg' | |
detections = detect_objects(image_path) | |
print(detections) | |