find-my-butterfly

Running

App Files Files Community

find-my-butterfly / similarity_utils.py

sasha HF staff

Duplicate from sayakpaul/fetch-similar-images

3ae84a3 almost 2 years ago

raw

history blame

5.23 kB

	from typing import List, Union

	import datasets
	import numpy as np
	import torch
	import torchvision.transforms as T
	from PIL import Image
	from tqdm.auto import tqdm
	from transformers import AutoFeatureExtractor, AutoModel

	seed = 42
	hash_size = 8
	hidden_dim = 768 # ViT-base
	np.random.seed(seed)


	# Device.
	device = "cuda" if torch.cuda.is_available() else "cpu"

	# Load model for computing embeddings..
	model_ckpt = "nateraw/vit-base-beans"
	extractor = AutoFeatureExtractor.from_pretrained(model_ckpt)

	# Data transformation chain.
	transformation_chain = T.Compose(
	[
	# We first resize the input image to 256x256 and then we take center crop.
	T.Resize(int((256 / 224) * extractor.size["height"])),
	T.CenterCrop(extractor.size["height"]),
	T.ToTensor(),
	T.Normalize(mean=extractor.image_mean, std=extractor.image_std),
	]
	)


	# Define random vectors to project with.
	random_vectors = np.random.randn(hash_size, hidden_dim).T


	def hash_func(embedding, random_vectors=random_vectors):
	"""Randomly projects the embeddings and then computes bit-wise hashes."""
	if not isinstance(embedding, np.ndarray):
	embedding = np.array(embedding)
	if len(embedding.shape) < 2:
	embedding = np.expand_dims(embedding, 0)

	# Random projection.
	bools = np.dot(embedding, random_vectors) > 0
	return [bool2int(bool_vec) for bool_vec in bools]


	def bool2int(x):
	y = 0
	for i, j in enumerate(x):
	if j:
	y += 1 << i
	return y


	def compute_hash(model: Union[torch.nn.Module, str]):
	"""Computes hash on a given dataset."""
	device = model.device

	def pp(example_batch):
	# Prepare the input images for the model.
	image_batch = example_batch["image"]
	image_batch_transformed = torch.stack(
	[transformation_chain(image) for image in image_batch]
	)
	new_batch = {"pixel_values": image_batch_transformed.to(device)}

	# Compute embeddings and pool them i.e., take the representations from the [CLS]
	# token.
	with torch.no_grad():
	embeddings = model(**new_batch).last_hidden_state[:, 0].cpu().numpy()

	# Compute hashes for the batch of images.
	hashes = [hash_func(embeddings[i]) for i in range(len(embeddings))]
	example_batch["hashes"] = hashes
	return example_batch

	return pp


	class Table:
	def __init__(self, hash_size: int):
	self.table = {}
	self.hash_size = hash_size

	def add(self, id: int, hashes: List[int], label: int):
	# Create a unique indentifier.
	entry = {"id_label": str(id) + "_" + str(label)}

	# Add the hash values to the current table.
	for h in hashes:
	if h in self.table:
	self.table[h].append(entry)
	else:
	self.table[h] = [entry]

	def query(self, hashes: List[int]):
	results = []

	# Loop over the query hashes and determine if they exist in
	# the current table.
	for h in hashes:
	if h in self.table:
	results.extend(self.table[h])
	return results


	class LSH:
	def __init__(self, hash_size, num_tables):
	self.num_tables = num_tables
	self.tables = []
	for i in range(self.num_tables):
	self.tables.append(Table(hash_size))

	def add(self, id: int, hash: List[int], label: int):
	for table in self.tables:
	table.add(id, hash, label)

	def query(self, hashes: List[int]):
	results = []
	for table in self.tables:
	results.extend(table.query(hashes))
	return results


	class BuildLSHTable:
	def __init__(
	self,
	model: Union[torch.nn.Module, None],
	batch_size: int = 48,
	hash_size: int = hash_size,
	dim: int = hidden_dim,
	num_tables: int = 10,
	):
	self.hash_size = hash_size
	self.dim = dim
	self.num_tables = num_tables
	self.lsh = LSH(self.hash_size, self.num_tables)

	self.batch_size = batch_size
	self.hash_fn = compute_hash(model.to(device))

	def build(self, ds: datasets.DatasetDict):
	dataset_hashed = ds.map(self.hash_fn, batched=True, batch_size=self.batch_size)

	for id in tqdm(range(len(dataset_hashed))):
	hash, label = dataset_hashed[id]["hashes"], dataset_hashed[id]["labels"]
	self.lsh.add(id, hash, label)

	def query(self, image, verbose=True):
	if isinstance(image, str):
	image = Image.open(image).convert("RGB")

	# Compute the hashes of the query image and fetch the results.
	example_batch = dict(image=[image])
	hashes = self.hash_fn(example_batch)["hashes"][0]

	results = self.lsh.query(hashes)
	if verbose:
	print("Matches:", len(results))

	# Calculate Jaccard index to quantify the similarity.
	counts = {}
	for r in results:
	if r["id_label"] in counts:
	counts[r["id_label"]] += 1
	else:
	counts[r["id_label"]] = 1
	for k in counts:
	counts[k] = float(counts[k]) / self.dim
	return counts