import hashlib
import json
import math
from functools import reduce
from typing import Mapping, Optional, Sequence
import numpy as np
import tensorflow as tf
import seqio
import gin
from .data_utils import flatten_parts, stateless_permutation, stateless_shuffle
from .. import config
def get_from_dict(data, keys):
"""Iterate nested dictionary"""
return reduce(dict.get, keys, data)
def get_blank_image():
image = tf.zeros([224, 224, 3], dtype=tf.uint8)
image = tf.expand_dims(image, 0)[:1]
return image
@seqio.utils.map_over_dataset
def rekey(x, key_map=None):
"""Replace the feature keys according to the mapping in `key_map`.
For example, if the dataset returns examples of the format:
{'foo': 'something', 'bar': 'something else'}
and key_map = {'boo': 'foo', 'spar': 'bar'} then this function will return
examples with the format
{'boo': 'something', 'spar': 'something else'}
If a mapping is to an empty key or None, set the new key to an empty string.
Args:
x: an example to process.
key_map: dictionary mapping new keys to original keys
Returns:
A preprocessed example with the format listed above.
"""
if key_map:
out = {}
for new_key, old_key in key_map.items():
if isinstance(old_key, list):
out[new_key] = get_from_dict(x, old_key)
else:
out[new_key] = x[old_key]
return out
return x
def rename(**kwargs):
@seqio.map_over_dataset
def _fn(x):
updates = {}
for new_key, old_key in kwargs.items():
if isinstance(old_key, list):
val = x[old_key[0]]
for k in old_key[1:-1]:
val = val[k]
updates[new_key] = val.pop(old_key[-1])
else:
updates[new_key] = x.pop(old_key)
x.update(updates)
return x
return _fn
def extract_transcripts(ds):
ds = flatten_parts(ds, ["transcripts"])
def _map(ex):
return dict(
image=ex["image"],
text=ex["transcripts"],
url=ex["url"]
)
return ds.map(_map)
@seqio.map_over_dataset
def extract_caption_and_all_transcripts(ex):
transcripts = tf.random.shuffle(ex["transcripts"])[:3]
weight = 1.0 / tf.cast(tf.shape(transcripts)[0], tf.float32)
return dict(
image=ex["image"],
text=tf.concat([tf.expand_dims(ex["caption"], 0), transcripts], 0),
url=ex["url"],
text_weights=tf.pad(
tf.ones((1,), dtype=tf.float32), [[0, tf.shape(transcripts)[0]]],
constant_values=weight),
)
@seqio.map_over_dataset
def extract_all_transcripts(ex):
transcripts = tf.random.shuffle(ex["transcripts"])[:3]
weight = 3.0 / tf.cast(tf.shape(transcripts)[0], tf.float32)
return dict(
image=ex["image"],
text=transcripts,
url=ex["url"],
text_weights=tf.fill((tf.shape(transcripts)[0],), weight),
)
@seqio.map_over_dataset
def extract_transcript(ex):
transcripts = tf.random.shuffle(ex["transcripts"])
return dict(
image=ex["image"],
text=transcripts[0],
url=ex["url"],
)
@seqio.map_over_dataset
def extract_caption(ex):
caption = ex["caption"]
if len(caption.shape) > 0:
ex["text"] = caption[0]
else:
ex["text"] = caption
return ex
@seqio.map_over_dataset
def extract_joint_captions(ex):
caption = ex["caption"]
if len(caption.shape) > 0:
caption = caption[0]
_ix = tf.random.uniform((), 0, tf.shape(ex["transcripts"])[0], dtype=tf.int32)
_ix = _ix % tf.shape(ex["transcripts"])[0]
return dict(
image=ex["image"],
text=tf.stack([caption, ex["mistral_caption"], ex["transcripts"][_ix]], 0),
url=ex["url"]
)
@seqio.map_over_dataset(num_seeds=1)
def extract_caption_and_transcript(ex, seed):
caption = ex["caption"]
if len(caption.shape) > 0:
caption = caption[0]
_ix = tf.random.stateless_uniform((), seed, 0, tf.shape(ex["transcripts"])[0], dtype=tf.int32)
return dict(
image=ex["image"],
text=tf.stack([caption, ex["transcripts"][_ix]], 0),
url=ex["url"]
)
@seqio.map_over_dataset
def caption_transcript_augmented(ex, sequence_length):
caption = ex["caption"]
if len(caption.shape) > 0:
caption = caption[0]
image = ex["image"]
properties = []
do_augmentation = sequence_length["is_training"]
# do_augmentation = False
# Keep this off, it screws up OCR
# do_hflip = (tf.random.uniform(()) > 0.2 and do_augmentation)
do_hflip = False
if do_hflip:
image = image[:, ::-1]
# Mild color jitter
do_color = (tf.random.uniform(()) > 0.5 and do_augmentation)
if do_color:
image = tf.image.random_hue(image, max_delta=0.05)
image = tf.image.random_brightness(image, max_delta=0.2)
image = tf.image.random_saturation(image, 0.7, 1.3)
image = tf.image.random_contrast(image, 0.7, 1.3)
# Mild affine transformation
do_affine = (tf.random.uniform(()) > 0.5 and do_augmentation)
if do_affine and do_augmentation:
shift_x = tf.random.uniform((), -10, 10) * 0
shift_y = tf.random.uniform((), -10, 10) * 0
shear_x = tf.random.uniform((), -2, 2)
shear_y = tf.random.uniform((), -2, 2)
rotation = tf.random.uniform((), -6, 6)
max_scale = 1.1
scale = tf.random.uniform((), 0.8, max_scale)
center = tf.cast(tf.shape(image), tf.float32)/2
image = tf.keras.ops.image.affine_transform(
image,
tf.stack(get_affine_matrix(
[center[0], center[1]],
rotation,
[shift_x, shift_y],
1/scale,
[shear_x, shear_y]
) + [0., 0.]),
interpolation='bilinear',
fill_mode='constant',
fill_value=1.,
data_format='channels_last'
)
properties = tf.stack([
("[hflip]" if do_hflip else ""),
("[color]" if do_color else ""),
("[affine]" if do_affine else "")
])
properties = tf.boolean_mask(properties, tf.strings.length(properties) > 0)
prompt = tf.strings.reduce_join(properties, separator=" ")
ix = tf.random.uniform((), 0, tf.shape(ex["transcripts"])[0], dtype=tf.int32)
out = dict(
image=image,
text=tf.stack([caption, ex["transcripts"][ix]], 0),
url=ex["url"],
prompt=prompt,
)
# out["metadata/unaugmented_image"] = image
return out
def extract_caption_and_transcript_hflip(ds):
# Just in case they are ordered somehow in Matt's data
@seqio.map_over_dataset
def _shuffle_transcripts(_ex):
_ex["transcripts"] = tf.random.shuffle(_ex["transcripts"])
_ex["hflip"] = tf.random.uniform((), 0, 3, dtype=tf.int32)
return _ex
ds = _shuffle_transcripts(ds)
# Build a 3x long dataset with each individual transcript so we iterate through
# each transcript
@seqio.map_over_dataset
def _with_transcript(ex, _ix):
caption = ex["caption"]
if len(caption.shape) > 0:
caption = caption[0]
hflip = ex["hflip"] == _ix
if hflip:
ex["image"] = ex["image"][:, ::-1]
style = ["long_caption_flipped", "transcript_flipped"]
else:
style = ["long_caption", "transcript"]
return dict(
image=ex["image"],
text=tf.stack([caption, ex["transcripts"][_ix]], 0),
url=ex["url"],
style=style
)
joint_ds = _with_transcript(ds, 0)
for i in range(1, 3):
joint_ds = joint_ds.concatenate(_with_transcript(ds, i))
return joint_ds
@seqio.map_over_dataset
def extract_llava(ex, sequence_length, output_features):
tf.assert_equal(tf.shape(ex['conversations']['value'])[0], 2)
prompt = ex['conversations']['value'][0]
text = ex['conversations']['value'][1]
ex.pop('conversations')
ex["text"] = text
ex["prompt"] = prompt
return ex
def extract_localized_narrative(ds):
ds = ds.filter(lambda ex: tf.shape(ex["cap/cap_caption"])[0] > 0)
def _map(ex):
return dict(
image=ex["image"],
text=tf.strings.reduce_join(ex["cap/cap_caption"], separator="\n")
)
return ds.map(_map)
def float_to_text(val):
return tf.strings.as_string(tf.cast(val * 100, tf.int32))
@seqio.map_over_dataset
def extract_vqa(ex):
questions = ex["vqa"]["questions"]
answers = ex["vqa"]["answers"]
answers = tf.strings.reduce_join(answers, 1, separator="; ")
qas = tf.strings.reduce_join(tf.stack([questions, answers], 1), separator=" ")
return dict(
image=ex["image"],
text=tf.strings.reduce_join(qas, separator="\n")
)
@seqio.map_over_dataset
def coco_image_id_from_path(ex):
image_id = tf.strings.substr(ex["image/filename"], 0, tf.strings.length(ex["image/filename"])-4)
ex["image_id"] = tf.strings.to_number(image_id)
return ex
@seqio.map_over_dataset
def add_coco_url(ex):
"""Turns a COCO path into a URL, which can then be used in visualizations"""
path = ex["image/filename"]
if not tf.strings.regex_full_match(path, ".*/.*"):
prefix = tf.strings.regex_replace(path, "COCO_", "")
prefix = tf.strings.regex_replace(prefix, "_[0-9]+.jpg", "")
path = tf.strings.join([prefix, path], separator="/")
# images are hosted by the COCO website here
url = tf.strings.join(["https://s3.us-east-1.amazonaws.com/images.cocodataset.org/", path])
ex["metadata/image_url"] = url
return ex
def flatten_vqa(ds):
parts = ["questions", "answers"]
for k in ["id", "question_id"]:
if k in ds.element_spec:
parts.append(k)
return flatten_parts(ds, parts)
def format_gqa(ds, is_balanced=True, flatten=True):
if is_balanced:
ds = ds.filter(lambda x: tf.reduce_any(x["questions"]["is_balanced"]))
def _filter_qs(ex):
qs = ex["questions"]
mask = qs["is_balanced"]
qs = {k: tf.boolean_mask(v, mask) for k, v in qs.items()}
ex["questions"] = qs
return ex
ds = ds.map(_filter_qs)
if flatten:
ds = flatten_parts(ds, ["questions"])
def _rename(ex):
out = ex["questions"]
out["image"] = ex["image"]
out["image_id"] = ex["image_id"]
return out
return ds.map(_rename)
@seqio.map_over_dataset
def fix_doqa_url(x):
x["image_url"] = tf.strings.regex_replace(x["image_url"], "gs://", "")
return x
def _add_metadata(ex):
out = {}
if "id" in ex:
out["metadata/example_id"] = ex["id"]
elif "example_id" in ex:
out["metadata/example_id"] = ex["example_id"]
elif "question_id" in ex:
out["metadata/example_id"] = ex["question_id"]
if "image_url" in ex:
out["metadata/image_url"] = ex["image_url"]
for k, v in ex.items():
if k.startswith("metadata/"):
out[k] = v
return out
def image_only(ds):
return ds.filter(lambda x: x["has_image"])
def filter_difficult_direct_answer(ds):
return ds.filter(lambda x: not x["difficult_direct_answer"])
@seqio.map_over_dataset()
def format_ai2d(ex, variable_style=True):
abc = tf.constant(list("abcdefg".upper()))
out = dict(image=ex["image"])
out.update(_add_metadata(ex))
options = ex["choices"]
# >= n_options - 1 allows one non-letter option, e.g. a "none of the above"-like answer
n_options = tf.shape(ex["option_is_abc"])[0]
if ex["abc_label"] and tf.reduce_sum(tf.cast(ex["option_is_abc"], tf.int32)) >= (n_options - 1):
# The image labels are always upper, so use upper in the answer options
options = tf.where(
ex["option_is_abc"],
tf.strings.upper(options),
options
)
short_options = options
style = "ai2_diagram_no_letter"
else:
short_options = abc[:tf.shape(options)[0]]
options = tf.stack([short_options, options,], 1)
options = tf.strings.reduce_join(options, axis=-1, separator=": ")
style = "ai2_diagram"
options = tf.strings.reduce_join(options, separator="\n")
out["question"] = ex["question"]
out["options"] = options
if variable_style:
out["style"] = style
if ex["answer_idx"] < 0:
out["text"] = "?"
else:
out["text"] = short_options[ex["answer_idx"]]
out["metadata/answer_idx"] = ex["answer_idx"]
tf.debugging.assert_equal(tf.reduce_any(tf.strings.regex_full_match(options, r".*\|\|\|.*")), False)
out["metadata/option_names"] = tf.strings.reduce_join(short_options, separator="|||")
out["metadata/has_transparent_box"] = ex.get("has_transparent_box", tf.constant(False))
out["metadata/abc_label"] = ex["abc_label"]
return out
@gin.configurable()
@seqio.map_over_dataset()
def format_multiple_choice_qa(ex, option_format="abc"):
assert option_format == "abc"
abc = tf.constant(list("abcdefg".upper()))
out = dict(image=ex["image"])
out.update(_add_metadata(ex))
options = ex["choices"]
short_options = abc[:tf.shape(options)[0]]
options = tf.stack([short_options, options,], 1)
options = tf.strings.reduce_join(options, axis=-1, separator=": ")
options = tf.strings.reduce_join(options, separator="\n")
out["question"] = ex["question"]
out["options"] = options
if ex["answer_idx"] < 0:
out["text"] = "?"
else:
out["text"] = short_options[ex["answer_idx"]]
out["metadata/answer_idx"] = ex["answer_idx"]
tf.debugging.assert_equal(tf.reduce_any(tf.strings.regex_full_match(options, r".*\|\|\|.*")), False)
out["metadata/option_names"] = tf.strings.reduce_join(short_options, separator="|||")
# out["metadata/option_names"] = tf.RaggedTensor.from_row_lengths(short_options, tf.shape(short_options))
# out["metadata/option_names"] = short_options
return out
@seqio.map_over_dataset()
def output_options(ex):
ex["metadata/options"] = ex["options"]
return ex
@seqio.map_over_dataset()
def extract_tally_qa(ex):
questions = ex.pop("questions")
ex["questions"] = questions["question"]
ex["answers"] = tf.strings.as_string(questions["answer"])
ex["question_id"] = questions["question_id"]
return ex
@seqio.map_over_dataset()
def count_bench_preprocessor(ex):
return {
"image": ex["image"],
"text": tf.strings.as_string(ex["number"]),
"object": ex["noun"],
"question": tf.strings.join([
"How many ", ex["noun"], " are there?"
]),
"metadata/count": ex["number"],
}
def filter_human(ds):
return ds.filter(lambda x: x["is_human"])
def filter_aug(ds):
return ds.filter(lambda x: not x["is_human"])
@seqio.map_over_dataset()
def reweight_chartqa(ex, human, aug):
is_human = ex["metadata/is_human"]
ex["text_weights"] = human if is_human else aug
return ex
@seqio.map_over_dataset()
def chartqa_prompting(ex):
question = tf.strings.join([ex["question"], " Answer:"])
return dict(
image=ex["image"],
question=question,
answer=ex["answer"]
)
@seqio.map_over_dataset()
def chartqa_explanation(ex):
question = tf.strings.join([ex["question"], " Explanation:"])
out = {
"image": ex["image"],
"question": question,
"answer": ex["answer"],
}
out.update({k: v for k, v in ex.items() if k.startswith("metadata/")})
return out
@seqio.map_over_dataset(num_seeds=1)
def _preprocess_scifi(ex, seed):
if "qa_pairs" in ex:
q = ex["qa_pairs"]
else:
q = ex["qa"]
ix = stateless_permutation(tf.shape(q["question"])[0], seed)
return dict(
image=ex["image"],
question=tf.gather(q["question"], ix),
explanation=tf.gather(q["explanation"], ix),
answer=tf.gather(q["answer"], ix),
)
@seqio.map_over_dataset
def scifi_explanation_only(ex):
return dict(
image=ex["image"],
question=ex["question"],
answer=ex["explanation"],
)
def filter_named_entity(ds):
@seqio.map_over_dataset
def _load_image(ex):
ex["image"] = tf.image.decode_image(ex['image'], channels=3, expand_animations=False)
return ex
ds = _load_image(ds)
return ds.filter(lambda x: tf.reduce_min(tf.shape(x["image"])[:2]) >= 32)
@seqio.map_over_dataset()
def extract_named_entity(ex):
qs = ex["questions"]
return {
"image": ex["image"],
"metadata/image_url": ex["url"],
"metadata/entity": ex["entity"],
"questions": qs["question"],
"answers": qs["answer"],
}
@gin.configurable()
def extract_individual_vqa(ds, test=False, answer_mode="best"):
@seqio.map_over_dataset(num_seeds=1)
def _extract(ex, seed):
if "questions" in ex:
question = ex["questions"]
else:
question = ex["question"]
out = dict(
image=ex["image"],
question=question,
)
out.update(_add_metadata(ex))
out["metadata/question"] = question
if ex.get("answers") is not None:
out["metadata/references"] = tf.strings.reduce_join(ex["answers"], separator="\n")
elif ex.get("answer") is not None:
out["metadata/references"] = ex["answer"]
if not test:
if "answer" in ex:
answer = ex["answer"]
else:
answer = ex["answers"]
if answer.dtype in [tf.int32, tf.int64]:
answer = tf.strings.as_string(answer)
if len(answer.shape) == 1 and tf.shape(answer)[0] == 0:
answer = tf.expand_dims("", 0)
if len(answer.shape) == len(question.shape):
pass
# Handle questions with multiple answers
elif answer_mode == "random":
assert len(answer.shape) == 1
answer = answer[tf.random.stateless_uniform((), seed, 0, tf.shape(answer)[0], dtype=tf.int32)]
elif answer_mode == "best":
def _get_best(_answer):
vals, _, counts = tf.unique_with_counts(_answer)
count_thresh = tf.reduce_max(counts)
vals = tf.boolean_mask(vals, counts >= count_thresh)
return vals[tf.random.stateless_uniform((), seed, 0, tf.shape(vals)[0], dtype=tf.int32)]
if len(answer.shape) == 1:
answer = _get_best(answer)
elif isinstance(answer, tf.RaggedTensor):
n = tf.shape(answer)[0]
answer_arr = tf.TensorArray(dtype=tf.string, size=n, element_shape=())
for i in range(n):
answer_arr = answer_arr.write(i, _get_best(answer[i]))
answer = answer_arr.stack()
else:
answer = tf.map_fn(_get_best, answer)
elif answer_mode == "all_segments":
out["text"] = answer
elif answer_mode == "all_segments_weighted":
out["text"] = answer
out["text_weights"] = 1.0 / tf.cast(tf.shape(answer)[-1], tf.float32)
elif answer_mode == "all":
if len(answer.shape) == 1:
answer = stateless_shuffle(answer, seed)
answer = tf.strings.reduce_join(answer, separator="\n", axis=-1)
elif isinstance(answer, tf.RaggedTensor):
n = tf.shape(answer)[0]
answer_arr = tf.TensorArray(dtype=tf.string, size=n, element_shape=())
for i in range(n):
answer_arr = answer_arr.write(i, tf.strings.reduce_join(tf.random.shuffle(answer[i]), separator="\n", axis=-1))
answer = answer_arr.stack()
else:
answer = tf.map_fn(tf.random.shuffle, answer)
answer = tf.strings.reduce_join(answer, separator="\n", axis=-1)
else:
raise NotImplementedError()
out["text"] = answer
return out
return _extract(ds)
@seqio.map_over_dataset()
def extract_khan_academy(ex):
return dict(
image=ex["image"],
image_url=ex["image_url"],
prompt="Answer this question",
text=ex["gptResponse"]
)
@seqio.map_over_dataset()
def extract_vaia_qa_latex_image(ex, add_short_answer=False, set_short_answer_first=False):
if ex["has_image"]:
image = tf.image.decode_image(ex['image'], channels=3, expand_animations=False)
image = tf.expand_dims(image, 0)[:1]
else:
# image = get_blank_image() # blank image
image = tf.image.decode_image(ex['image'], channels=3, expand_animations=False)
image = tf.expand_dims(image, 0)[:0]
img_h = tf.shape(image)[1]
img_w = tf.shape(image)[2]
if add_short_answer:
if set_short_answer_first:
answer = tf.strings.join(["Answer: ", ex["short_answer"], "\n\n", ex["answer"]])
else:
answer = tf.strings.join([ex["answer"], "\n\n", "Answer: ", ex["short_answer"]])
else:
answer = ex["answer"]
out = dict(
image=image, # 4-d tensor
text=answer,
prompt=tf.strings.join([ex["latex_question"], "\n"]),
)
out["metadata/images"] = image
out.update(_add_metadata(ex))
out["metadata/batch_id"] = ex["batch_id"]
out["metadata/image_size"] = [img_w, img_h]
return out
@seqio.map_over_dataset()
def extract_vqa_online(ex):
out = dict(
image=ex["image"],
prompt=tf.strings.join([ex["question"], "\n"]),
text=ex["answer"]
)
out.update(_add_metadata(ex))
out["metadata/row_id"] = ex["row_id"]
return out
@seqio.map_over_dataset()
def extract_scifi_joint(ex):
if "qa_pairs" in ex:
q = ex["qa_pairs"]
else:
q = ex["qa"]
prompts = tf.concat([["Describe this image in detail."], q["question"]], 0)
responses = tf.concat([ex["summary"][None], q["answer"]], 0)
return dict(
image=ex["image"],
prompt=prompts,
text=responses,
)
def remove_no_qa(ds):
def _filter(ex):
if "qa_pairs" in ex:
q = ex["qa_pairs"]
else:
q = ex["qa"]
return tf.shape(q["question"])[0] > 0
return ds.filter(_filter)
@seqio.map_over_dataset()
def extract_scifi_qa_exp(ex):
return dict(
image=ex["image"],
question=ex["question"], # Array of questions
answer=tf.strings.join([ex["explanation"], " Answer: ", ex["answer"]]),
)
@seqio.map_over_dataset(num_seeds=1)
def extract_scifi_qa_demo(ex, seed):
# if tf.random.stateless_uniform((), 0, 1) > 0.5:
answer = tf.strings.join([ex["explanation"], " Answer: ", ex["answer"]])
# else:
# answer = ex["explanation"]
return dict(
image=ex["image"],
question=ex["question"], # Array of questions
answer=answer,
)
@seqio.map_over_dataset()
def clock_bench_preprocessor(ex):
out = dict(
image=ex["image"],
prompt="What time is being shown?",
)
for k in ["hour", "minute", "second", "answerable"]:
out[f"metadata/{k}"] = ex[k]
return out
def deg2rad(x):
return x*math.pi/180.0
def get_affine_matrix(center, angle, translate, scale, shear):
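"""Build the flattened affine matrix [a, b, tx, c, d, ty] for the given rotation (degrees),
translation, scale, and shear, following the referenced torchvision inverse-matrix construction."""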
# From https://github.com/pytorch/vision/blob/f96c42fca53230057b16941b078a0a9eee06e20f/torchvision/transforms/functional.py#L1006
rot = deg2rad(angle)
sx = deg2rad(shear[0])
sy = deg2rad(shear[1])
cx, cy = center
tx, ty = translate
# RSS without scaling
a = tf.cos(rot - sy) / tf.cos(sy)
b = -tf.cos(rot - sy) * tf.tan(sx) / tf.cos(sy) - tf.sin(rot)
c = tf.sin(rot - sy) / tf.cos(sy)
d = -tf.sin(rot - sy) * tf.tan(sx) / tf.cos(sy) + tf.cos(rot)
matrix = [a, b, 0.0, c, d, 0.0]
matrix = [x * scale for x in matrix]
# Apply inverse of center translation: RSS * C^-1
matrix[2] += matrix[0] * (-cx) + matrix[1] * (-cy)
matrix[5] += matrix[3] * (-cx) + matrix[4] * (-cy)
# Apply translation and center : T * C * RSS * C^-1
matrix[2] += cx + tx
matrix[5] += cy + ty
return matrix
def quantize_point(coor, max_dim, mode="percent-precision-1"):
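"""Encode a coordinate as a string after normalizing by `max_dim`: "percent-precision-1" gives
0-100 with one decimal, "zero_to_one" gives 0-1 with three decimals, "1k" gives 0-1000 integers."""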
max_dim = tf.cast(max_dim, tf.float32)
coor = tf.cast(coor, tf.float32)
x = (coor / max_dim)
if mode == "percent-precision-1":
return tf.strings.as_string(x*100, precision=1)
elif mode == "zero_to_one":
return tf.strings.as_string(x, precision=3)
elif mode == "1k":
return tf.strings.as_string(x*1000, precision=0)
else:
raise NotImplementedError(mode)
def construct_pointing_format(label_text, alt_text, x_str, y_str):
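"""Format points as point tags, e.g. '<point x="10.0" y="20.0" alt="dog">dog</point>' for a
single point, or '<points x1="..." y1="..." x2="..." y2="..." alt="...">label</points>' for
several; returns "" when there are no points."""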
if alt_text is None:
alt_text = label_text
np = tf.shape(x_str)[0]
if np == 0:
output = ""
elif np == 1:
output = tf.strings.join([
'<point x="', x_str[0], '" y="', y_str[0], '" alt="',
alt_text, '">', label_text, '</point>'
])
else:
ids = tf.strings.as_string(tf.range(1, np + 1, dtype=tf.int32))
xs = tf.strings.join(["x", ids, '="', x_str, '"'])
ys = tf.strings.join(["y", ids, '="', y_str, '"'])
points = tf.strings.reduce_join(tf.reshape(tf.stack([xs, ys], 1), [-1]), separator=' ', axis=-1)
output = tf.strings.join(
["<points ", points, ' alt="', alt_text, '">', label_text, "</points>"])
return output
def order_points(x, y, seed, point_order):
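"""Reorder point strings: "natural" keeps the input order, "random" shuffles with `seed`,
"xy" sorts by x then y, and "yx" sorts by y then x."""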
if point_order == "natural":
return x, y
if point_order == "random":
ix = stateless_permutation(tf.shape(x)[0], seed)
elif point_order == "xy":
x_float, y_float = tf.strings.to_number(x), tf.strings.to_number(y)
ix = tf.argsort(x_float*100000 + y_float)
elif point_order == "yx":
x_float, y_float = tf.strings.to_number(x), tf.strings.to_number(y)
ix = tf.argsort(y_float*100000 + x_float)
else:
raise NotImplementedError(point_order)
return tf.gather(x, ix), tf.gather(y, ix)
@gin.configurable()
def points_to_text(x, y, w, h, seed, label=None, alt_text=None, point_mode="percent-precision-1",
point_order="xy", point_list_mode="tag"):
"""Returns a string encoding of a list of points"""
x = quantize_point(x, w, point_mode)
y = quantize_point(y, h, point_mode)
# Order the quantized points so the order matches what was generated; this can matter
# when points share the same quantized value, e.g. (10.001, 20) and (10.002, 10) should be
# represented as (10, 10), (10, 20), but sorting before quantization would give (10, 20), (10, 10)
x, y = order_points(x, y, seed, point_order)
if point_list_mode == "tag":
return construct_pointing_format(label, alt_text, x, y)
elif point_list_mode == "paren":
n = tf.shape(x)[0]
return tf.strings.reduce_join(tf.strings.join([
"(", x, ", ", y, ")"
]), separator=", ")
# if n == 0:
# output = ""
# else:
# ids = tf.strings.as_string(tf.range(1, np + 1, dtype=tf.int32))
# xs = tf.strings.join(["x", ids, '="', x_str, '"'])
# ys = tf.strings.join(["y", ids, '="', y_str, '"'])
# points = tf.strings.reduce_join(tf.reshape(tf.stack([xs, ys], 1), [-1]), separator=' ', axis=-1)
# output = tf.strings.join(
# ["<points ", points, ' alt="', alt_text, '">', label_text, "</points>"])
# return output
else:
raise NotImplementedError(point_list_mode)
def points_to_answer(x, y, w, h, seed, label, is_counting, alt_text=None):
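"""Build an answer string from points. Counting questions get a "Counting the ... shows a total
of N." sentence, otherwise the point string is returned directly; zero points gives "There are none."."""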
count = tf.shape(x)[0]
if is_counting:
if count == 0:
return "There are none."
else:
point_text = points_to_text(x, y, w, h, seed, label, alt_text)
return tf.strings.join([
"Counting the ", point_text,
" shows a total of ",
tf.strings.as_string(count),
"."
])
else:
if count == 0:
return "There are none."
else:
return points_to_text(x, y, w, h, seed, label, alt_text)
@seqio.map_over_dataset(num_seeds=2)
def extract_point_qa(ex, seeds, answer_type="y_major"):
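"""Build point-QA message pairs, replacing each <|POINT|> placeholder in the answer with a
formatted point string built from the matching annotation."""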
ex["image"] = tf.image.decode_image(ex['image'], channels=3, expand_animations=False)
img_h = tf.shape(ex["image"])[0]
img_w = tf.shape(ex["image"])[1]
questions = ex["questions"]
question = questions["question"]
n = tf.shape(question)[0]
answers = tf.TensorArray(tf.string, size=n, element_shape=())
point_text = questions["annotations"]["point_text"]
point_seeds = tf.RaggedTensor.from_row_splits(
row_splits=point_text.row_splits,
values=tf.random.split(seeds[0], num=tf.shape(point_text.values)[0])
)
for question_ix in range(n):
anno = questions["annotations"]
answer = questions["answer_with_placeholders"][question_ix]
n_anno = tf.shape(anno["point_text"][question_ix])[0]
for anno_ix in range(n_anno):
points = anno["points"][question_ix, anno_ix]
point_text = points_to_answer(
points[:, 0], points[:, 1], 100, 100,
point_seeds[question_ix, anno_ix],
anno["point_text"][question_ix, anno_ix],
False,
alt_text=anno["alt_text"][question_ix, anno_ix],
)
answer_split = tf.strings.split(answer, sep="<|POINT|>", maxsplit=1)
answer = tf.strings.join([answer_split[0], point_text, answer_split[1]])
# Make sure all placeholders were used
tf.debugging.assert_equal(tf.shape(tf.strings.split(answer, sep="<|POINT|>"))[0], 1)
answers = answers.write(question_ix, answer)
messages = tf.stack([question, answers.stack()], axis=1)
messages = tf.reshape(messages, [-1])
conversation_ids = tf.range(tf.shape(messages)[0] // 2, dtype=tf.int32)
conversation_ids = tf.repeat(conversation_ids, 2)
out = dict(
image=ex["image"],
messages=tf.RaggedTensor.from_value_rowids(messages, conversation_ids)
)
ix = stateless_permutation(tf.shape(messages)[0], seeds[1])
messages = tf.gather(messages, ix)
out.update(_add_metadata(ex))
out["metadata/image_size"] = [img_w, img_h]
return out
def select_point(mask):
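"""Pick one point per mask: the in-mask pixel closest to the mask's centroid.
Returns (cx, cy) arrays in pixel coordinates."""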
bs = tf.shape(mask)[0]
valid = tf.cast(mask, tf.float32)
h, w = tf.shape(mask)[1], tf.shape(mask)[2]
ys = tf.range(h, dtype=tf.int32)
xs = tf.range(w, dtype=tf.int32)
n = tf.reduce_sum(valid, [1, 2])
cy = tf.reduce_sum(tf.cast(ys[None, :, None], tf.float32) * valid, [1, 2]) / n # [bs]
cx = tf.reduce_sum(tf.cast(xs[None, None, :], tf.float32) * valid, [1, 2]) / n # [bs]
dist_y = tf.square(tf.range(h, dtype=tf.float32)[None, :] - cy[:, None]) # [bs, h]
dist_x = tf.square(tf.range(w, dtype=tf.float32)[None, :] - cx[:, None]) # [bs, w]
dist = dist_y[:, :, None] + dist_x[:, None, :] # [batch, h, w]
dist = dist + (1 - valid) * 1e12
min_dist = tf.argmin(tf.reshape(dist, [bs, -1]), axis=-1) # [batch]
w = tf.cast(w, min_dist.dtype)
cy = tf.cast(min_dist // w, tf.float32)
cx = tf.cast(min_dist % w, tf.float32)
return cx, cy
@seqio.map_over_dataset
def refexp_pointing(ex):
img_h = tf.shape(ex["image"])[0]
img_w = tf.shape(ex["image"])[1]
objects = ex["objects"]
# Shuffle objects so that which object gets dropped, if the sequence gets truncated, is randomized
refexps = objects['refexp']['raw']
bbox = objects["bbox"]
mask = tf.squeeze(objects["mask"], -1)
ix = tf.range(0, tf.shape(refexps)[0], dtype=tf.int32)
ix = tf.random.shuffle(ix)
refexps = tf.gather(refexps, ix)
bbox = tf.gather(bbox, ix)
mask = tf.gather(mask, ix)
cx, cy = select_point(mask)
answers = points_to_text(img_h, img_w, cx, cy)
out = {
"image": ex["image"],
"refexp": refexps.values,
"metadata/image_size": tf.stack([img_w, img_h,]),
"text": tf.repeat(answers, refexps.row_lengths()),
}
if "image_url" in ex:
out["metadata/image_url"] = ex["image_url"]
return out
@seqio.map_over_dataset
def refexp_pointing_inf(ex):
img_h = tf.shape(ex["image"])[0]
img_w = tf.shape(ex["image"])[1]
objects = ex["objects"]
mask = tf.squeeze(objects["mask"], -1)
cx, cy = select_point(mask)
answers = points_to_text(img_h, img_w, cx, cy)
refexps = objects["refexp"]["raw"]
# We can't use `mask` directly since it is variable size, and thus it
# will break batching. Here we serialize it instead
serialized_masks = tf.map_fn(tf.io.serialize_tensor, mask, fn_output_signature=tf.string)
out = {
"image": ex["image"],
"refexp": refexps,
"metadata/bbox": objects["bbox"],
"metadata/answer": answers,
"metadata/mask": serialized_masks,
"metadata/image_size": tf.stack([img_w, img_h]),
}
out.update({k: v for k, v in ex.items() if k.startswith("metadata/")})
return out
@seqio.map_over_dataset
def extract_andriod_control_inf(ex, mode):
if mode == "ll":
prompt = tf.strings.join(["low_level: ", ex["metadata/ll_instruction"]])
elif mode == "hl_ll":
prompt = tf.strings.join([
"high_level: ", ex["metadata/hl_instruction"],
" low_level: ", ex["metadata/ll_instruction"]
])
elif mode == "hl":
prompt = tf.strings.join(["high_level: ", ex["metadata/hl_instruction"]])
elif mode == "hl_cot":
prompt = tf.strings.join(["high_level_cot: ", ex["metadata/hl_instruction"]])
else:
raise NotImplementedError()
out = dict(
image=ex["image"],
prompt=prompt,
text=ex["metadata/target_action"]
)
out.update(_add_metadata(ex))
return out
@seqio.map_over_dataset
def extract_android_control(ex):
# Each image has three tasks:
# low level -> action
# high+low level -> action
# high level -> action
# high level -> low level + action (CoT)
out = dict(
image=ex["image"],
prompt=tf.stack([
tf.strings.join(["low_level: ", ex["metadata/ll_instruction"]]),
tf.strings.join([
"high_level: ", ex["metadata/hl_instruction"],
" low_level: ", ex["metadata/ll_instruction"]
]),
tf.strings.join(["high_level: ", ex["metadata/hl_instruction"]]),
tf.strings.join(["high_level_cot: ", ex["metadata/hl_instruction"]]),
]),
text=tf.stack([
ex["metadata/target_action"],
ex["metadata/target_action"],
ex["metadata/target_action"],
tf.strings.join(["Plan: ", ex["metadata/ll_instruction"], " Action: ", ex["metadata/target_action"]]),
])
)
# Only needed if visualizing
# ex["image"] = tf.image.decode_image(ex['image'], channels=3, expand_animations=False)
# img_h = tf.shape(ex["image"])[0]
# img_w = tf.shape(ex["image"])[1]
# out["metadata/image_size"] = tf.stack([img_w, img_h,])
out.update(_add_metadata(ex))
return out
@seqio.map_over_dataset(num_seeds=1)
def refexp(ex, seed):
img_h = tf.shape(ex["image"])[0]
img_w = tf.shape(ex["image"])[1]
objects = ex["objects"]
# Shuffle objects so that which object gets dropped, if the sequence gets truncated, is randomized
refexps = objects['refexp']['raw']
bbox = objects["bbox"]
ix = stateless_permutation(tf.shape(refexps)[0], seed)
refexps = tf.gather(refexps, ix)
bbox = tf.gather(bbox, ix)
x2 = bbox[:, 0] + bbox[:, 2]
y2 = bbox[:, 1] + bbox[:, 3]
with tf.control_dependencies([
tf.debugging.assert_equal(tf.reduce_any(x2 <= tf.cast(img_w, tf.float32)), True),
tf.debugging.assert_equal(tf.reduce_any(y2 <= tf.cast(img_h, tf.float32)), True)
]):
answers = points_to_text(
img_h, img_w,
tf.reshape(tf.stack([bbox[:, 0], x2], 1), [-1]),
tf.reshape(tf.stack([bbox[:, 1], y2], 1), [-1]))
answers = tf.strings.reduce_join(tf.reshape(answers, [-1, 2]), separator=" ", axis=1)
out = {
"image": ex["image"],
"refexp": refexps.values,
"metadata/bbox": bbox,
"metadata/image_size": tf.stack([img_w, img_h,]),
"text": tf.repeat(answers, refexps.row_lengths()),
}
if "image_url" in ex:
out["image_url"] = ex["image_url"]
return out
@seqio.map_over_dataset
def refexp_inf(ex):
img_h = tf.shape(ex["image"])[0]
img_w = tf.shape(ex["image"])[1]
out = {
"image": ex["image"],
"refexp": ex["objects"]["refexp"]["raw"],
"metadata/bbox": ex["objects"]["bbox"],
"metadata/image_size": tf.stack([img_w, img_h,]),
}
out.update({k: v for k, v in ex.items() if k.startswith("metadata/")})
return out
def point_text_interleaved(*args):
raise NotImplementedError()
@seqio.map_over_dataset
def web_pointing_preprocessor(ex):
img_h = tf.shape(ex["image"])[0]
img_w = tf.shape(ex["image"])[1]
question = point_text_interleaved(
img_h, img_w, ex["question"], ex["question_points"]["x"], ex["question_points"]["y"])
answer = point_text_interleaved(
img_h, img_w, ex["answer"], ex["answer_points"]["x"], ex["answer_points"]["y"])
answer_points = tf.stack([ex["answer_points"]["x"], ex["answer_points"]["y"]], axis=1)
return {
"question": question,
"answer": answer,
"image": ex["image"],
"metadata/image_size": [img_w, img_h],
"metadata/question_type": ex["question_type"],
"metadata/answer_points": tf.io.serialize_tensor(answer_points),
"metadata/answer": answer,
}
def filter_pointing(ds):
return ds.filter(lambda ex: tf.shape(ex["answer_points"]["x"])[0] >= 1)
def filter_qa(ds):
return ds.filter(lambda ex: tf.shape(ex["answer_points"]["x"])[0] == 0)
# vaia filtering
def filter_image_only(ds):
return ds.filter(lambda ex: ex["has_image"])
def filter_mc(ds):
return ds.filter(lambda ex: ex["is_mc"])
def remove_is_long(ds):
return ds.filter(lambda ex: not ex["is_long"])
def remove_has_multiple_parts(ds):
return ds.filter(lambda ex: not ex["has_multiple_parts"])
def _split(ds: tf.data.Dataset, keys, n_splits=2):
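"""Split each example into up to `n_splits` examples by chunking the listed keys along their
first dimension; tuple keys index into nested dicts, and examples with too few entries pass through unsplit."""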
def _map(ex):
n = tf.shape(ex[keys[0]])[0]
if n < n_splits:
return tf.data.Dataset.from_tensors(ex)
else:
# import pdb; pdb.set_trace()
bs = n // n_splits
remainder = n - bs*n_splits
lens = tf.concat([
tf.ones([remainder], dtype=tf.int32),
tf.zeros([n_splits-remainder], dtype=tf.int32),
], axis=0) + bs
tf.debugging.assert_equal(tf.reduce_sum(lens), n)
ends = tf.cumsum(lens)
parts = []
for split_ix in range(n_splits):
part_ex = dict(ex)
e = ends[split_ix]
s = e - lens[split_ix]
for k in keys:
if isinstance(k, tuple):
assert len(k) == 2
part_ex[k[0]][k[1]] = ex[k[0]][k[1]][s:e]
else:
part_ex[k] = ex[k][s:e]
parts.append(part_ex)
ds = tf.data.Dataset.from_tensors(parts[0])
for sub_ds in parts[1:]:
sub_ds = tf.data.Dataset.from_tensors(sub_ds)
ds = ds.concatenate(sub_ds)
return ds
return ds.flat_map(_map)
def split(ds, n=2):
# return ds
return _split(ds, [k for k in [
"question",
"label",
"text",
"entity",
"messages"
] if k in ds.element_spec], n_splits=n)
def split_points(ds, max_points=50):
label = "question" if "question" in ds.element_spec else "label"
return _split(ds, [
"question", label, "notInImage",
("answer_points", "x"),
("answer_points", "y"),
])
@seqio.map_over_dataset
def fix_count_qa(ex):
ex["label"] = ex["label"][::2]
tf.debugging.assert_equal(tf.shape(ex["answer_points"]["x"])[0], tf.shape(ex["label"])[0])
return ex
def filter_points(ds, max_number=40):
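"""Mark questions valid if all their points lie in [0, 100] and they have at most `max_number`
points, then drop examples with no valid questions."""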
def _add_valid(ex):
valid = (
tf.reduce_all(ex["answer_points"]["x"] >= 0.0, axis=-1) &
tf.reduce_all(ex["answer_points"]["x"] <= 100.0, axis=-1) &
tf.reduce_all(ex["answer_points"]["y"] >= 0.0, axis=-1) &
tf.reduce_all(ex["answer_points"]["y"] <= 100.0, axis=-1) &
(ex["answer_points"]["y"].row_lengths() <= max_number)
)
ex["valid"] = valid
return ex
ds = ds.map(_add_valid)
ds = ds.filter(lambda ex: tf.reduce_any(ex["valid"]))
return ds
# def filter_points(ds, max_number=30):
# n_points = ds["answer_points"]["x"].row_lengths()
# parts = tf.TensorArray(tf.int32, size=tf.shape(n_points[0]), element_shape=tf.TensorShape([None]))
# total = 0
# on_row = 0
# for i in range(n_points):
# n = n_points[i]
# if n > max_number:
# continue
# if n + total > max_number:
#
# return ds
@seqio.map_over_dataset(num_seeds=2)
def pointing_preprocessor(ex, sequence_length, seeds, with_count=False):
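"""Turn a pointing example into (entity, question, point-string answer) fields, keeping only the
questions marked valid and shuffling their order."""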
image = tf.image.decode_image(ex['image'], channels=3, expand_animations=False)
img_h = tf.shape(image)[0]
img_w = tf.shape(image)[1]
ix = tf.where(ex["valid"])[:, 0]
ix = stateless_shuffle(ix, seeds[0])
if "label" in ex:
question = tf.strings.lower(ex["label"])
else:
question = ex["question"]
question = tf.gather(question, ix) # [n_question]
points_x = tf.gather(ex["answer_points"]["x"], ix) # [n_question, n_points[ragged]]]
points_y = tf.gather(ex["answer_points"]["y"], ix)
not_in_image = tf.gather(ex["notInImage"], ix) # [n_question]
n = tf.shape(points_x)[0]
point_text = tf.TensorArray(dtype=tf.string, size=n, element_shape=()) # [n_question]
point_seeds = tf.random.split(seeds[1], n)
for i in range(n):
answer = points_to_answer(points_x[i], points_y[i], 100, 100, point_seeds[i], question[i], with_count)
point_text = point_text.write(i, answer)
return {
"image": image,
"metadata/image_size": [img_w, img_h],
"entity": question,
"question": question,
"text": point_text.stack(),
}
@seqio.map_over_dataset
def pointing_inf_preprocessor(ex):
ex["image"] = tf.image.decode_image(ex['image'], channels=3, expand_animations=False)
img_h = tf.shape(ex["image"])[0]
img_w = tf.shape(ex["image"])[1]
question = ex["question"]
not_in_image = tf.shape(ex["answer_points"]["x"])[0] == 0
# points are stored in normalized format, de-normalize here
points_x = ex["answer_points"]["x"] * tf.cast(img_w, tf.float32) / 100.0
points_y = ex["answer_points"]["y"] * tf.cast(img_h, tf.float32) / 100.0
out = dict(
image=ex["image"],
question=question,
entity=question,
)
out.update(_add_metadata(ex))
out["metadata/not_in_image"] = not_in_image
# We can't use `mask` directly since it is variable size, and thus it
# will break batching. Here we serialize it instead
serialized_masks = tf.map_fn(tf.io.serialize_tensor, ex["masks"], fn_output_signature=tf.string)
serialized_masks = tf.strings.reduce_join(serialized_masks, separator="|||")
out["metadata/mask"] = serialized_masks
out["metadata/question"] = question
out["metadata/answer_points"] = tf.io.serialize_tensor(tf.stack([points_x, points_y], 1))
out["metadata/image_size"] = [img_w, img_h]
return out
@seqio.map_over_dataset(num_seeds=1)
def count_qa_preprocessor_inf(ex, sequence_length, seed):
image = tf.image.decode_image(ex['image'], channels=3, expand_animations=False)
img_h = tf.shape(image)[0]
img_w = tf.shape(image)[1]
entity = tf.strings.substr(
ex["question"], len("How many "), tf.strings.length(ex["question"]) - len("How many "))
entity = tf.strings.split(entity, sep=" are ", maxsplit=1)[0]
entity = tf.strings.lower(entity)
tf.debugging.assert_equal(tf.strings.length(entity) != 0, True)
return {
"image": image,
"metadata/image_size": [img_w, img_h],
"metadata/count": tf.strings.to_number(ex["answer"]),
"question": ex["question"],
"entity": entity,
}
@seqio.map_over_dataset(num_seeds=1)
def count_qa_preprocessor(ex, sequence_length, seed, with_count=False,
for_inference=False):
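"""Build a counting/pointing example from `point_answer`, a string of number triples whose
2nd and 3rd values are taken as normalized x/y coordinates; the counted entity is parsed from
the "How many ... are ..." question."""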
point_answer = ex["point_answer"]
numbers_str = tf.strings.regex_replace(point_answer, r'\.$', '')
numbers_str = tf.strings.regex_replace(numbers_str, r'[^\d\.\s]+', '')
numbers_str = tf.strings.strip(numbers_str)
numbers = tf.strings.split(numbers_str)
float_numbers = tf.strings.to_number(numbers, out_type=tf.float32)
coordinates = tf.reshape(float_numbers, (-1, 3))
points_x = coordinates[:, 1]
points_y = coordinates[:, 2]
image = tf.image.decode_image(ex['image'], channels=3, expand_animations=False)
img_h = tf.shape(image)[0]
img_w = tf.shape(image)[1]
entity = tf.strings.substr(
ex["question"], len("How many "), tf.strings.length(ex["question"]) - len("How many "))
entity = tf.strings.split(entity, sep=" are ", maxsplit=1)[0]
entity = tf.strings.lower(entity)
tf.debugging.assert_equal(tf.strings.length(entity) != 0, True)
count = tf.strings.to_number(ex["answer"], out_type=tf.int32)
if for_inference:
return {
"image": image,
"metadata/image_size": [img_w, img_h],
"metadata/count": count,
"question": ex["question"],
"entity": entity,
}
else:
tf.debugging.assert_equal(count, tf.shape(points_x)[0])
# points are already normalized so use w=1, h=1
answer = points_to_answer(points_x, points_y, 1, 1, seed, entity, with_count)
return {
"image": image,
"metadata/image_size": [img_w, img_h],
"metadata/count": count,
"question": ex["question"],
"entity": entity,
"text": answer,
}
@gin.configurable()
@seqio.map_over_dataset
def cleanup_preprocessor(ex, preprocess=False):
if preprocess:
ex["prompt"] = tf.strings.join(
[
"[[User]]: Correct the spelling and punctuation mistakes on the following transcript based on what appears in the image.\n\n{before} ",
ex["prompt"],
"\n[[Assistant]]: {after}"
]
)
return ex
else:
return ex
@gin.configurable()
@seqio.map_over_dataset
def random_text_preprocessor(ex, preprocess=False):
ex["prompt"] = "What does the text say in this image?"
if preprocess:
ex["prompt"] = tf.strings.join(["[[User]]: ", ex["prompt"], "\n[[Assistant]]:"])
return ex
else:
return ex
@seqio.map_over_dataset(num_seeds=25)
def clock_augmentation(ex, seeds):
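"""Apply aggressive geometric augmentation (shear, rotation, scale, random translation) and mild
color jitter to a clock image, padding with a white background."""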
seeds = list(seeds)
image = ex["image"]
# Apply shear, rotation, and scale through one affine matrix
height = tf.cast(tf.shape(image)[0], tf.float32)
width = tf.cast(tf.shape(image)[1], tf.float32)
_call_id = [0]
def _rng(_minval=0, _maxval=1, shape=(), dtype=tf.float32):
return tf.random.stateless_uniform(shape, seeds.pop(), _minval, _maxval, dtype=dtype)
sel = _rng(0, 1)
if sel < 0.1:
# Straight on
shear_x = 0.
shear_y = 0.
rotation = 0.
elif sel < 0.5:
# Normal looking
shear_x = _rng(-10, 10)
shear_y = _rng(-10, 10)
rotation = _rng(-25, 25)
else:
# Allowed to be very wonky
# if tf.random.stateless_uniform((), seeds.pop(), 0, 1) > 0.8:
# image = image[:, ::-1]
if _rng() > 0.5:
shear_x = _rng( -30, 30)
shear_y = _rng( -30, 30)
else:
shear_x = _rng( -10, 10)
shear_y = _rng( -10, 10)
rng = _rng( 0, 1)
if rng < 0.2:
rotation = _rng( -25, 25)
elif rng < 0.6:
rotation = _rng( -80, 80)
else:
rotation = _rng( -180, 180)
if _rng() > 0.5:
scale = _rng( 0.3, 2)
else:
scale = _rng( 0.3, 1)
# Pad so upscaling/rotation will not move the image out of bounds
pad = tf.cast(tf.maximum(height, width)*0.5, tf.int32)
image = tf.pad(image, [[pad, pad], [pad, pad], [0, 0]], constant_values=1)
height = tf.cast(tf.shape(image)[0], tf.float32)
width = tf.cast(tf.shape(image)[1], tf.float32)
image = tf.keras.ops.image.affine_transform(
image,
tf.stack(get_affine_matrix(
[height/2, width/2],
rotation,
[0, 0],
1/scale,
[shear_x, shear_y]
) + [0., 0.]),
interpolation='bilinear',
fill_mode='constant',
fill_value=1.,
data_format='channels_last'
)
# Crop to the non-white content, otherwise it would be impossible to place the content at a corner of the image
not_white = tf.logical_not(tf.reduce_all(image > 0.99, -1))
no_white_ix = tf.where(not_white)
top_left = tf.reduce_min(no_white_ix, axis=0)
bottom_right = tf.reduce_max(no_white_ix, axis=0)
image = tf.image.crop_to_bounding_box(
image,
offset_height=tf.cast(top_left[0], tf.int32),
offset_width=tf.cast(top_left[1], tf.int32),
target_height=tf.cast(bottom_right[0] - top_left[0] + 1, tf.int32),
target_width=tf.cast(bottom_right[1] - top_left[1] + 1, tf.int32),
)
# Translate
height, width = tf.shape(image)[0], tf.shape(image)[1]
translation_seed = _rng(0, 1)
if translation_seed < 0.2:
h_pad = _rng(0, height//2, (2,), dtype=tf.int32)
w_pad = _rng(0, width//2, (2,), dtype=tf.int32)
else:
h_pad = _rng(0, height*2, (2,), dtype=tf.int32)
w_pad = _rng(0, width*2, (2,), dtype=tf.int32)
image = tf.pad(image, [[h_pad[0], w_pad[0]], [h_pad[1], w_pad[1]], [0, 0]],
constant_values=1)
# Random background color
# color_rng = tf.random.stateless_uniform((4,), seeds.pop(), 0, 1)
# random_color = color_rng[:3]
# valid = tf.reduce_all(tf.reduce_sum(tf.abs(random_color[None, None, :] - image), -1) > 0.03)
# if color_rng[0] < 0.2 and valid:
# image = tf.where(tf.reduce_all(image < 0.99, axis=-1, keepdims=True),
# image, image * 0 + random_color[None, None, :])
# Mild color jitter
image = tf.image.stateless_random_hue(image, max_delta=0.05, seed=seeds.pop())
image = tf.image.stateless_random_brightness(image, max_delta=0.15, seed=seeds.pop())
image = tf.image.stateless_random_saturation(image, 0.8, 1.2, seed=seeds.pop())
image = tf.image.stateless_random_contrast(image, 0.8, 1.2, seed=seeds.pop())
# ex["metadata/unaugmented_image"] = ex["image"]
ex["image"] = image
return ex
@seqio.map_over_dataset
def clocks_preprocessor(ex):
time_format = ex["time_format"]
shows_seconds = ex["shows_seconds"]
hour, minute, second = [tf.cast(ex[k], tf.int32) for k in ["hour", "minute", "second"]]
if hour == 0: # Midnight of the previous day
am_pm = "PM"
hour_str = 12
hour = 24
elif hour > 12:
am_pm = "PM"
hour_str = hour - 12
else:
hour_str = hour
am_pm = "AM"
hour_str = tf.strings.as_string(hour_str)
minute_str = tf.strings.as_string(minute)
if tf.strings.length(minute_str) == 1:
minute_str = tf.strings.join(["0", minute_str])
second_str = tf.strings.as_string(second)
if tf.strings.length(second_str) == 1:
second_str = tf.strings.join(["0", second_str])
prefix = "The time shown is "
if time_format == "The time is not shown":
text = "The time is not shown in the image."
hour, minute, second = -1, -1, -1
else:
if not shows_seconds:
second = -1
if time_format == "12 hour clock (without AM/PM)" and shows_seconds:
if hour > 12:
hour = hour - 12
time = tf.strings.join([hour_str, ":", minute_str, ":", second_str])
elif time_format == "12 hour clock (with AM/PM)" and shows_seconds:
time = tf.strings.join([hour_str, ":", minute_str, ":", second_str, " ", am_pm])
elif time_format == "12 hour clock (with AM/PM)" and not shows_seconds:
time = tf.strings.join([hour_str, ":", minute_str, " ", am_pm])
elif time_format == "12 hour clock (without AM/PM)" and not shows_seconds:
if hour > 12:
hour = hour - 12
time = tf.strings.join([hour_str, ":", minute_str])
else:
time = "" # Should never occur, but needed for tf analysis
tf.debugging.assert_equal(tf.strings.length(time) > 0, True)
text = tf.strings.join(["The time shown is ", time])
image = tf.image.decode_image(ex['image'], channels=3, expand_animations=False)
image = tf.image.convert_image_dtype(image, tf.float32)[:-120] # remove the black shadow at the bottom
return {
"image": image,
"prompt": "What time is being shown?",
"text": text,
"metadata/time_format": time_format,
"metadata/hour": hour,
"metadata/minute": minute,
"metadata/text": text,
"metadata/second": second,
}
@seqio.map_over_dataset()
def atlas_obscura_preprocessor(ex):
out = dict(
image=ex["image"],
prompt="Where was this picture taken?",
text=tf.strings.join([
ex["place"],
" in ",
ex["city"]
])
)
out["metadata/image_url"] = ex["image_url"]
out["metadata/references"] = out["text"]
return out
@seqio.map_over_dataset()
def famous_birthdays_preprocessor(ex):
out = dict(
image=ex["image"],
image_url=ex["image_url"],
prompt="Who is this?",
text=ex["name"]
)
out["metadata/references"] = out["text"]
return out
@seqio.map_over_dataset()
def mild_color_aug_preprocessor(ex):
if "image_url" in ex: # URL won't show the augmentations
del ex["image_url"]
# ex["metadata/unaugmented_image"] = ex["image"]
ex["image"] = tf.image.decode_image(ex['image'], channels=3, expand_animations=False)
ex["image"] = mild_color_aug(ex["image"])
return ex
def build_text_with_points(text, points, img_h, img_w):
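"""Replace each <ANS> placeholder in `text` with an encoded point string and split the result
on double newlines."""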
points = points_to_text(img_h, img_w, points[:, 0], points[:, 1])
parts = tf.strings.split(text, sep="<ANS>")
with_points = tf.strings.reduce_join(tf.reshape(tf.stack([
parts,
tf.pad(points, [[0, 1]], constant_values=""),
], 1), [-1]), separator="")
return tf.strings.split(with_points, "\n\n")
@seqio.map_over_dataset()
def synth_count_preprocessor(example):
image_shape = tf.shape(example["image"])
h, w = image_shape[0], image_shape[1]
questions = build_text_with_points(example["questions"], example["question_points"], h, w)
answers = build_text_with_points(example["answers"], example["answer_points"], h, w)
keep_q = tf.strings.regex_full_match(questions, "How many.*")
keep_ans = tf.strings.regex_full_match(answers, "There are [0-9]+.*")
keep = tf.logical_and(keep_q, keep_ans)
questions = tf.boolean_mask(questions, keep)
answers = tf.boolean_mask(answers, keep)
ix = tf.range(0, tf.shape(answers)[0], dtype=tf.int32)
ix = tf.random.shuffle(ix)
return dict(
image=example["image"],
prompt=tf.gather(questions, ix),
text=tf.gather(answers, ix),
)
def synth_count_inf_preprocessor(ds):
@seqio.map_over_dataset(num_seeds=1)
def get_two(example, seed):
image_shape = tf.shape(example["image"])
h, w = image_shape[0], image_shape[1]
questions = build_text_with_points(example["questions"], example["question_points"], h, w)
answers = build_text_with_points(example["answers"], example["answer_points"], h, w)
keep_q = tf.strings.regex_full_match(questions, "How many.*")
keep_ans = tf.strings.regex_full_match(answers, "There are [0-9]+.*")
keep = tf.logical_and(keep_q, keep_ans)
questions = tf.boolean_mask(questions, keep)
answers = tf.boolean_mask(answers, keep)
ix = stateless_permutation(tf.shape(answers)[0], seed)[:2]
return {
"image": example["image"],
"prompt": tf.gather(questions, ix),
"metadata/references": tf.gather(answers, ix),
}
ds = get_two(ds)
return flatten_parts(ds, ["prompt", "metadata/references"])
def mild_color_aug(image):
image = tf.image.random_hue(image, max_delta=0.05)
image = tf.image.random_brightness(image, max_delta=0.15)
image = tf.image.random_saturation(image, 0.7, 1.3)
image = tf.image.random_contrast(image, 0.8, 1.2)
return image
@seqio.map_over_dataset()
def name_entity_augmentation(ex, p_high_color=0.7):
ex["image"] = tf.image.decode_image(ex['image'], channels=3, expand_animations=False)
image = ex["image"]
image = tf.image.convert_image_dtype(image, tf.float32)
# Horizontal flip
if tf.random.uniform((), 0, 1) > 0.85:
image = image[:, ::-1]
# Random crop
height = tf.cast(tf.shape(image)[0], tf.float32)
width = tf.cast(tf.shape(image)[1], tf.float32)
crop_rng = tf.random.uniform((), 0, 1)
if crop_rng < 0.2:
pass
else:
if crop_rng < 0.4:
h_crop = height * 0.15
w_crop = width * 0.15
else:
h_crop = height * 0.4
w_crop = width * 0.4
crop_h = tf.cast(tf.random.uniform((2,), 0, h_crop/2), tf.int32)
crop_w = tf.cast(tf.random.uniform((2,), 0, w_crop/2), tf.int32)
image = image[crop_h[0]:-crop_h[1]-1, crop_w[0]:-crop_w[1]-1]
height = tf.cast(tf.shape(image)[0], tf.float32)
width = tf.cast(tf.shape(image)[1], tf.float32)
if tf.random.uniform(()) > p_high_color:
image = tf.image.random_hue(image, max_delta=0.05)
image = tf.image.random_brightness(image, max_delta=0.15)
image = tf.image.random_saturation(image, 0.7, 1.3)
image = tf.image.random_contrast(image, 0.8, 1.2)
else:
image = tf.image.random_hue(image, max_delta=0.1)
image = tf.image.random_brightness(image, max_delta=0.3)
image = tf.image.random_saturation(image, 0.0, 2.0)
image = tf.image.random_contrast(image, 0.2, 1.5)
# Apply shear, rotation, and scale through one affine matrix
sel = tf.random.uniform((), 0, 1)
if sel < 0.1:
pass
else:
if sel < 0.15: # Scale only
shear_x = 0
shear_y = 0
rotation = 0
if sel < 0.7: # Mild
shear_x = tf.random.uniform((), -2, 2)
shear_y = tf.random.uniform((), -2, 2)
rotation = tf.random.uniform((), -5, 5)
else: # Severe
shear_x = tf.random.uniform((), -10, 10)
shear_y = tf.random.uniform((), -10, 10)
rotation = tf.random.uniform((), -20, 20)
max_scale = 1.2
scale = tf.random.uniform((), 0.4, max_scale)
# Pad so upscaling/rotation will not move the image out of bounds
pad = tf.cast(tf.maximum(height, width)*0.2, tf.int32)
image = tf.pad(image, [[pad, pad], [pad, pad], [0, 0]], constant_values=1)
image = tf.keras.ops.image.affine_transform(
image,
tf.stack(get_affine_matrix(
[height/2, width/2],
rotation,
[0, 0],
1/scale,
[shear_x, shear_y]
) + [0., 0.]),
interpolation='bilinear',
fill_mode='constant',
fill_value=1.,
data_format='channels_last'
)
# Crop to the non-white content, otherwise it would be impossible to place the content at a corner of the image
not_white = tf.logical_not(tf.reduce_all(image > 0.99, -1))
no_white_ix = tf.where(not_white)
top_left = tf.reduce_min(no_white_ix, axis=0)
bottom_right = tf.reduce_max(no_white_ix, axis=0)
# Very small chance the crop region is nothing but white space; if so, skip the crop
if (
(bottom_right[0] - top_left[0]) > 1 and (bottom_right[1] - top_left[1]) > 1
):
image = tf.image.crop_to_bounding_box(
image,
offset_height=tf.cast(top_left[0], tf.int32),
offset_width=tf.cast(top_left[1], tf.int32),
target_height=tf.cast(bottom_right[0] - top_left[0] + 1, tf.int32),
target_width=tf.cast(bottom_right[1] - top_left[1] + 1, tf.int32),
)
# Translate
height, width = tf.shape(image)[0], tf.shape(image)[1]
if tf.random.uniform((), 0, 1) < 0.1:
h_pad = tf.zeros((2,), dtype=tf.int32)
w_pad = tf.zeros((2,), dtype=tf.int32)
elif tf.random.uniform((), 0, 1) < 0.8:
h_pad = tf.random.uniform((2,), 0, 50, dtype=tf.int32)
w_pad = tf.random.uniform((2,), 0, 50, dtype=tf.int32)
else:
pad = tf.cast(tf.maximum(height, width), tf.int32)
h_pad = tf.random.uniform((2,), 0, pad, dtype=tf.int32)
w_pad = tf.random.uniform((2,), 0, pad, dtype=tf.int32)
image = tf.pad(image, [[h_pad[0], w_pad[0]], [h_pad[1], w_pad[1]], [0, 0]],
constant_values=1)
if "image_url" in ex: # URL won't show the augmentations
del ex["image_url"]
# ex["metadata/unaugmented_image"] = ex["image"]
ex["image"] = image
return ex
@seqio.map_over_dataset()
def wiki_art_preprocessor(ex):
out = dict(
image=ex["image"],
prompt="What is this?",
text=ex["question"]
)
out["metadata/title"] = ex["title"]
out["metadata/gt"] = ex["question"]
out["metadata/artist"] = ex["artist"]
out["metadata/painting_url"] = ex["painting_url"]
# if "metadata/unaugmented_image" in ex:
# out["metadata/unaugmented_image"] = ex["metadata/unaugmented_image"]
return out
@seqio.map_over_dataset()
def oscar_preprocessor(ex):
out = dict(
image=ex["image"],
prompt=ex["question"]
)
out.update(_add_metadata(ex))
out["metadata/question"] = ex["question"]
out["metadata/answer"] = ex["answer"]
out["metadata/category"] = ex["category"]
return out
@seqio.map_over_dataset()
def tulu_preprocessor(ex):
return {
"messages": ex["messages"]["content"],
}
# logging.info("Debugging tulue")
# return {"messages": ex["messages"]["content"], "text_weights": 1e-6}
WIKI_DATA_QUESTION = "What is this? Respond with just a proper name."
@seqio.map_over_dataset()
def extract_wiki_data(ex):
return dict(
image=ex["image"],
image_url=ex["image_url"],
prompt=[
WIKI_DATA_QUESTION,
"What is this? Respond with the proper name of the main focus of the image and a few details about it."
],
text=[
tf.strings.strip(tf.strings.regex_replace(ex["question"], r"\(.*\)", "")),
ex["gptResponse"],
]
)
@seqio.map_over_dataset()
def extract_wiki_data_name(ex):
target = tf.strings.strip(tf.strings.regex_replace(ex["question"], r"\(.*\)", ""))
out = dict(
image=ex["image"],
image_url=ex["image_url"],
prompt=WIKI_DATA_QUESTION,
text=target,
)
out["metadata/references"] = target
return out
@seqio.map_over_dataset()
def extract_wiki_data_describe(ex):
out = dict(
image=ex["image"],
image_url=ex["image_url"],
prompt="What is this? Respond with the proper name of the main focus of the image and a few details about it.",
)
out["metadata/references"] = ex["gptResponse"]
return out
@gin.configurable()
def format_multiple_style_qa(ds, types=['multiple_choice', 'short_answer'], styles=['ai2_diagram', 'vqa2'], default_style='vqa2',
strip_instruction=False):
def _extract(ex):
prompt = ex["question"]
out = dict(image=ex["image"])
out.update(_add_metadata(ex))
out["text"] = ex["answer"]
out["metadata/references"] = ex["answer"]
if ex["metadata/question_type"] == 'multiple_choice':
style = styles[0]
else:
style = styles[1]
if strip_instruction:
if ex["metadata/question_type"] == "multiple_choice":
# parts = tf.strings.split(prompt, "\n")
# parts 1 is blank and part -1 is the instruction
# prompt = tf.strings.reduce_join(tf.concat([parts[:1], parts[2:-1]], 0), separator="\n")
prompt = prompt
else:
prompt = tf.strings.split(prompt, "\n")[0]
out["style"] = style
out["prompt"] = prompt
return out
ds = ds.map(_extract, num_parallel_calls=tf.data.experimental.AUTOTUNE)
return ds
@gin.configurable()
def extract_mmmu(ds, types=['multiple-choice', 'open'], styles=['ai2_diagram', 'vqa2'], default_style='ai2_diagram', option_format="abc"):
assert option_format == "abc"
keys_tensor = tf.constant(types, dtype=tf.string)
values_tensor = tf.constant(styles, dtype=tf.string)
table = tf.lookup.StaticHashTable(
tf.lookup.KeyValueTensorInitializer(keys_tensor, values_tensor),
default_value=tf.constant(default_style, dtype=tf.string),
)
def _extract(ex):
out = dict(image=tf.expand_dims(ex["image_1"], 0))
out.update(_add_metadata(ex))
style = table.lookup(ex["metadata/question_type"])
out["style"] = style
out["text"] = ex["answer"]
out["metadata/references"] = ex["answer"]
if style == styles[0]:
abc = tf.constant(list("abcdefghi".upper()))
options = ex["options"]
num_options = tf.shape(options)[0]
dummy_options = tf.tile(tf.constant([""], dtype=tf.string), [9 - num_options])
out["metadata/options"] = tf.concat([options, dummy_options], axis=0)
out["metadata/options"] = tf.ensure_shape(out["metadata/options"], [9])
short_options = abc[:num_options]
options = tf.stack([short_options, options,], 1)
options = tf.strings.reduce_join(options, axis=-1, separator=": ")
options = tf.strings.reduce_join(options, separator="\n")
out["prompt"] = tf.strings.join([ex["question"], "\n", options, "\n"])
if tf.reduce_sum(tf.cast(tf.strings.regex_full_match(options, "<img='(.*?)'>"), tf.int32)) > 1:
# Following LLaVA, don't use any images if there are multiple image paths
# I think the rationale is that this means the images are answer options
out["image"] = out["image"][:0]
else:
out["metadata/options"] = tf.constant([""] * 9, dtype=tf.string)
out["prompt"] = ex["question"]
out["image"] = out["image"][:0]
return out
ds = ds.map(_extract, num_parallel_calls=tf.data.experimental.AUTOTUNE)
return ds
@gin.configurable()
def extract_mmmu_cot(ds, types=['multiple-choice', 'open'], styles=['ai2_diagram', 'vqa2'], default_style='ai2_diagram', option_format="abc"):
assert option_format == "abc"
keys_tensor = tf.constant(types, dtype=tf.string)
values_tensor = tf.constant(styles, dtype=tf.string)
table = tf.lookup.StaticHashTable(
tf.lookup.KeyValueTensorInitializer(keys_tensor, values_tensor),
default_value=tf.constant(default_style, dtype=tf.string),
)
def _extract(ex):
# out = dict(image=tf.expand_dims(ex["image_with_question"], 0))
out = dict(image=tf.expand_dims(ex["image_1"], 0))
out.update(_add_metadata(ex))
style = table.lookup(ex["metadata/question_type"])
# out["style"] = style
out["text"] = ex["answer"]
out["metadata/question"] = ex["question"]
out["metadata/references"] = ex["answer"]
if style == styles[0]:
abc = tf.constant(list("abcdefghi".upper()))
options = ex["options"]
num_options = tf.shape(options)[0]
dummy_options = tf.tile(tf.constant([""], dtype=tf.string), [9 - num_options])
out["metadata/options"] = tf.concat([options, dummy_options], axis=0)
out["metadata/options"] = tf.ensure_shape(out["metadata/options"], [9])
short_options = abc[:num_options]
options = tf.stack([short_options, options,], 1)
options = tf.strings.reduce_join(options, axis=-1, separator=": ")
options = tf.strings.reduce_join(options, separator="\n")
out["prompt"] = tf.strings.join([ex["question"], "\n", options, "\n"])
# out["prompt"] = ex["question"]
if tf.reduce_sum(tf.cast(tf.strings.regex_full_match(options, "<img='(.*?)'>"), tf.int32)) > 1:
# Following LLaVA, don't use any images if the options embed multiple image paths;
# presumably this means the images themselves are the answer options.
out["image"] = out["image"][:0]
else:
out["metadata/options"] = tf.constant([""] * 9, dtype=tf.string)
out["prompt"] = ex["question"]
# out["image"] = out["image"][:0]
return out
ds = ds.map(_extract, num_parallel_calls=tf.data.experimental.AUTOTUNE)
return ds
@seqio.map_over_dataset
def reformat_math_vista(ex):
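"""Reduces a MathVista query to just its question text, dropping the "Hint:" and "Question:" scaffolding."""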
query = ex["query"]
query = tf.strings.split(query, sep="Question:")[-1]
query = tf.strings.strip(tf.strings.split(query, sep="Hint:")[0])
ex["query"] = query
return ex
@seqio.map_over_dataset
def extract_math_vista(ex, styles=['ai2_diagram', 'vqa2']):
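"""Formats MathVista examples. Multiple-choice questions use `styles[0]` and, outside
the test split, are answered with the letter of the correct option; free-form
questions use `styles[1]` and are answered with the raw answer string.
"""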
out = dict(image=ex["image"])
out.update(_add_metadata(ex))
is_mc = ex["metadata/question_type"] == 'multi_choice'
if is_mc:
style = styles[0]
abc = tf.constant(list("abcdefghi".upper()))
options = ex["choices"]
num_options = tf.shape(options)[0]
dummy_options = tf.tile(tf.constant([""], dtype=tf.string), [9 - num_options])
out["metadata/options"] = tf.concat([options, dummy_options], axis=0)
out["metadata/options"] = tf.ensure_shape(out["metadata/options"], [9])
if ex["metadata/split"] != "test":
short_options = abc[:num_options]
answer_short_option = tf.boolean_mask(short_options, options == ex["answer"])[0]
out["text"] = answer_short_option
else:
out["text"] = ex["answer"]
else:
style = styles[1]
out["metadata/options"] = tf.constant([""] * 9, dtype=tf.string)
out["text"] = ex["answer"]
out["style"] = style
out["prompt"] = ex["query"]
out["metadata/query"] = ex["query"]
out["metadata/references"] = ex["answer"]
return out
NO_POINT_PREFIX = [
"No pointing: ",
"No pointing: ",
"no pointing:\n",
"No pointing:\n",
"Not pointing:\n",
"No Points: ",
"No Points: ",
"NO POINTING\n",
"No pontiing\n",
"No Points:\n ",
"No pointing\n",
"Do not point. ",
"Refrain from pointing. ",
"Avoid generating points . ",
"For this question, do not use points. ",
"Refrain from using points:\n",
"Don't include points in your response. ",
"Don't point. ",
"Don't use points. ",
"Please don't use points.\n\n",
"Please don't use points.\n\n",
"Respond without using points. ",
"Respond without pointing:\n",
"Do not generate ponits: ",
"Do not point. ",
"Do not point\n",
"no pointing\n\n",
"Answer without points: ",
"Answer this question without pointing: ",
"Answer without poiints. ",
"answer without points: ",
"answer with text only, do not points\n"
]
assert all(x[-1].isspace() for x in NO_POINT_PREFIX)
NO_POINT_PREFIX_TF = tf.constant(NO_POINT_PREFIX)
def prefix_how_many(messages, seed):
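"""Prepends a randomly chosen "no pointing" instruction to questions that start
with "how many", so counting questions are answered with plain text."""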
question = messages[0]
if tf.strings.regex_full_match(tf.strings.lower(question), "how many.*"):
ix = tf.random.stateless_uniform((), seed, 0, len(NO_POINT_PREFIX), tf.int32)
question = tf.strings.join([NO_POINT_PREFIX_TF[ix], question])
return tf.concat([tf.expand_dims(question, 0), messages[1:]], axis=0)
else:
return messages
@seqio.map_over_dataset(num_seeds=1)
def prefix_how_many_messages(ex, seed):
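"""Applies `prefix_how_many` to the first message of each conversation in the example."""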
messages = ex["messages"]
n = tf.shape(messages)[0]
seeds = tf.random.split(seed, n)
message_arr = tf.TensorArray(dtype=tf.string, size=n, element_shape=(None,))
for i in range(n):
message_arr = message_arr.write(i, prefix_how_many(messages[i], seeds[i]))
ex["messages"] = tf.RaggedTensor.from_row_splits(
values=message_arr.concat(), row_splits=messages.row_splits)
return ex
def filter_single_turn(ds):
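"""Keeps only conversations with more than two messages and drops examples that have none left."""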
@seqio.map_over_dataset
def _filter(ex):
multi_turn = ex["messages"].row_lengths() > 2
ex["messages"] = tf.ragged.boolean_mask(ex["messages"], multi_turn)
return ex
ds = _filter(ds)
ds = ds.filter(lambda x: tf.shape(x["messages"])[0] > 0)
return ds
@seqio.map_over_dataset(num_seeds=1)
def extract_cockatoo_qa_v2(ex, seed):
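"""Groups flat messages into conversations by `conversation_ids` and shuffles the conversation order."""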
messages = tf.RaggedTensor.from_value_rowids(ex["messages"], ex["conversation_ids"])
ix = stateless_permutation(tf.shape(messages)[0], seed)
messages = tf.gather(messages, ix)
out = dict(
image=ex["image"],
messages=messages
)
out.update(_add_metadata(ex))
return out
def format_mmbench(ds):
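"""Flattens the multiple passes of each MMBench example into individual QA examples,
storing the answer options joined with "|||" under `metadata/options`.
"""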
def _trim(ex):
num_passes = tf.shape(ex["id"])[0]
ex["choices"] = ex["choices"][:num_passes, :num_passes]
ex["answer"] = ex["answer"][:num_passes]
return ex
ds = ds.map(_trim)
ds = flatten_parts(ds, ["id", "query", "choices", "answer"])
def _extract(ex):
out = dict(image=ex["image"])
out.update(_add_metadata(ex))
out["prompt"] = ex["query"]
out["text"] = ex["answer"]
options = ex["choices"]
tf.debugging.assert_equal(tf.reduce_any(tf.strings.regex_full_match(options, r".*\|\|\|.*")), False)
out["metadata/options"] = tf.strings.reduce_join(options, separator="|||")
out["metadata/question"] = ex["question"]
out["metadata/references"] = ex["answer"]
return out
ds = ds.map(_extract, num_parallel_calls=tf.data.experimental.AUTOTUNE)
return ds
@seqio.map_over_dataset
def extract_lvis(ex, class_name_file="gs://oe-training-chrisc/cockatoo/data/lvis_class_names.json"):
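"""Maps LVIS label ids to class-name strings loaded from `class_name_file`."""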
with tf.io.gfile.GFile(class_name_file) as f:
class_names = json.load(f)
class_names_arr = [None]*len(class_names)
for k, v in class_names.items():
class_names_arr[int(k)] = v
assert all(x is not None for x in class_names_arr)
class_names_arr = tf.constant(class_names_arr)
return dict(
image=ex["image"],
bbox=ex["objects"]["bbox"],
label=tf.gather(class_names_arr, ex["objects"]["label"]),
)
def extract_open_images_boxes(ds):
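"""Keeps Open Images examples that have a caption and extracts their boxes, labels,
and the newline-joined captions.
"""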
# ds = ds.filter(lambda ex: tf.logical_or(
# tf.shape(ex["cap/cap_caption"])[0] > 0,
# tf.shape(ex["detection/bbox"])[0] > 0
# ))
ds = ds.filter(lambda ex: tf.shape(ex["cap/cap_caption"])[0] > 0)
@seqio.map_over_dataset
def _map(ex):
bbox = tf.reshape(ex["detection/bbox"], (-1, 4))
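# Reorder box coordinates (assumed to be stored flat as [xmin, xmax, ymin, ymax])
# into [ymin, xmin, ymax, xmax].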
bbox = tf.stack([
bbox[:, 2],
bbox[:, 0],
bbox[:, 3],
bbox[:, 1]
], 1)
return dict(
image=tf.image.decode_jpeg(ex["image"]),
bbox=bbox,
label=ex["detection/label"],
caption=tf.strings.reduce_join(ex["cap/cap_caption"], separator="\n")
)
return _map(ds)
@seqio.map_over_dataset
def region_captions_to_dense(ex):
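"""Converts region captions into a single dense-captioning target: each caption is
paired with its "cx,cy,w,h" box (randomly placed before or after the text), regions
are sorted by a randomly chosen criterion (left/right/top/bottom/largest/smallest),
and any overall caption is randomly prepended or appended.
"""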
if "captions" in ex:
captions = ex["captions"]["text"]
boxes = ex["captions"]["bbox"]
else:
captions = ex["label"]
boxes = ex["bbox"]
sh = tf.cast(tf.shape(ex["image"])[:2], tf.float32)
# image_h, image_w = sh[0], sh[1]
w = boxes[:, 2] - boxes[:, 0]
h = boxes[:, 3] - boxes[:, 1]
cx = tf.cast(boxes[:, 0] + w/2, tf.float32)
cy = tf.cast(boxes[:, 1] + h/2, tf.float32)
# w = w / image_w
# h = h / image_h
coor = tf.strings.reduce_join(
float_to_text(tf.stack([cx, cy, w, h], 1)), separator=",", axis=1)
area = w*h
if tf.random.uniform(()) < 0.5:
coor_text = "before"
captions = tf.strings.join([coor, captions], separator=": ")
else:
coor_text = "after"
captions = tf.strings.join([captions, coor], separator=": ")
ix = tf.random.uniform((), 0, 6, tf.int32)
if ix == 0:
order_text = "left"
sort_by = boxes[:, 0]
elif ix == 1:
order_text = "right"
sort_by = -boxes[:, 2]
elif ix == 2:
order_text = "top"
sort_by = boxes[:, 1]
elif ix == 3:
order_text = "bottom"
sort_by = -boxes[:, 3]
elif ix == 4:
order_text = "largest"
sort_by = area
else:
order_text = "smallest"
sort_by = -area
ixs = tf.argsort(sort_by)
captions = tf.gather(captions, ixs)
text = tf.strings.join([
order_text,
coor_text,
tf.strings.reduce_join(captions, separator="\n")
], separator="; ")
if "caption" in ex:
if tf.random.uniform(()) > 0.5:
text = tf.strings.join([text, "\ncaption: ", ex["caption"]])
else:
text = tf.strings.join(["caption: ", ex["caption"], "\n", text])
return dict(
image=ex["image"],
text=text
)
@seqio.map_over_dataset()
def join_captions(ex):
text = tf.random.shuffle(ex['text'])
ex["text"] = tf.strings.reduce_join(text, separator="\n")
return ex
@seqio.map_over_dataset(num_seeds=1)
def extract_figureqa(ex, seed):
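"""Shuffles the questions attached to a FigureQA image and stringifies the 0/1 answers."""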
questions = ex["questions"]
n = stateless_permutation(tf.shape(questions["question"])[0], seed)
return dict(
image=ex["image"],
questions=tf.gather(questions["question"], n),
question_id=tf.gather(questions["question_id"], n),
answer=tf.gather(tf.strings.as_string(questions["answer"]), n)
)
@seqio.map_over_dataset
def convert_figureqa_answer(ex):
keys_tensor = tf.constant(["0", "1"])
values_tensor = tf.constant(["no", "yes"])
table = tf.lookup.StaticHashTable(
tf.lookup.KeyValueTensorInitializer(keys_tensor, values_tensor),
default_value=tf.constant("nan", dtype=tf.string),
)
answer = table.lookup(ex["answer"])
ex["answer"] = answer
return ex
@seqio.map_over_dataset()
def build_question_with_hint(ex):
hint = ex["hint"]
if tf.strings.length(hint) > 0:
ex["question"] = tf.strings.join([hint, ex["question"]], separator="\n")
return ex
@seqio.map_over_dataset()
def build_question_with_context(ex):
context = ex["context"]
if tf.strings.length(context) > 0:
ex["question"] = tf.strings.join([context, ex["question"]], separator="\n")
return ex
def max_words(ds, max_words):
return ds.filter(lambda x: x["n_words"] <= max_words)
@seqio.map_over_dataset
def format_pdfa_eng_wds(example):
return dict(
image=example["image"],
text=tf.strings.reduce_join(example["lines"]["text"], separator="\n"),
)
@gin.configurable()
def accuracy_conditioned_joint(ds, sequence_length, is_eval=False, eval_quality=17,
transcript_quality=None):
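"""Builds quality-conditioned captioning examples.
At evaluation time each example gets a single fixed "quality N:" prompt; at training
time each example yields three targets: the caption (quality 17), a randomly chosen
transcript, and the edited caption scored as 17 minus its number of edits.
"""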
# v2: Transcripts no longer get a quality score
is_training = sequence_length.get('is_training', True)
if not is_training:
if is_eval:
prompt = f"quality {eval_quality}:"
else:
prompt = "quality 17:"
@seqio.map_over_dataset
def _with_prompt(ex):
out = dict(
image=ex["image"],
url=ex["url"],
prompt=prompt,
)
if "text" in ex:
out["text"] = ex["text"]
elif "caption" in ex:
out["text"] = ex["caption"]
return out
return _with_prompt(ds)
elif is_eval:
raise ValueError("is_eval=True requires is_training=False")
# each transcript
@seqio.map_over_dataset
def _with_transcript(ex):
if tf.shape(ex["edited_captions"]["caption"])[0] > 0:
edited_caption = ex["edited_captions"]["caption"][0]
n = ex["edited_captions"]["n_edits"][0]
else:
edited_caption = ""
n = 0
text = [
ex["caption"],
ex["transcripts"][tf.random.uniform((), 0, tf.shape(ex["transcripts"])[0], dtype=tf.int32)],
edited_caption
]
edit_quality = 17 - n
prompt = [
"quality 17:",
"" if transcript_quality is None else f"quality: {edit_quality}:",
tf.strings.join(["quality ", tf.strings.as_string(edit_quality), ":"])
]
return dict(
image=ex["image"],
text=tf.stack(text, 0),
url=ex["url"],
prompt=tf.stack(prompt, 0),
style=["long_caption", "transcript", "long_caption"]
)
return _with_transcript(ds)
def select_dense_caption_sample(ds, samples=200):
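"""Filters the dataset to a fixed, deterministic sample of `samples` images from the
dense-caption eval manifest, matching on image URL.
"""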
def compute_hash(string: str) -> str:
return hashlib.sha256(string.encode("utf-8")).hexdigest()
with tf.io.gfile.GFile("gs://oe-training-chrisc/cockatoo/data/dense-caption-eval-v0-final-data.json") as f:
data = json.load(f)
for ex in data:
ex["image_id"] = compute_hash(ex["image"])
data.sort(key=lambda x: x["image_id"])
np.random.RandomState(12312).shuffle(data)
keep = tf.constant([x["image"] for x in data[:samples]])
def _keep(ex):
return tf.reduce_any(ex["url"] == keep)
ds = ds.filter(_keep)
ds = tf.data.experimental.assert_cardinality(samples)(ds)
return ds
@seqio.map_over_dataset()
def charxiv_preprocessor(ex):
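"""Stacks the four descriptive QA pairs and the reasoning QA pair of each CharXiv figure."""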
question_names = ["descriptive_q1", "descriptive_q2", "descriptive_q3", "descriptive_q4", "reasoning_q"]
answer_names = ["descriptive_a1", "descriptive_a2", "descriptive_a3", "descriptive_a4", "reasoning_a"]
questions = [ex[name] for name in question_names]
answers = [ex[name] for name in answer_names]
return dict(
image=ex["image"],
question=tf.stack(questions, 0),
answer=tf.stack(answers, 0)
)
@seqio.map_over_dataset()
def charxiv_descriptive_preprocessor(ex):
question_names = ["descriptive_q1", "descriptive_q2", "descriptive_q3", "descriptive_q4"]
answer_names = ["descriptive_a1", "descriptive_a2", "descriptive_a3", "descriptive_a4"]
questions = [ex[name] for name in question_names]
answers = [ex[name] for name in answer_names]
return dict(
image=ex["image"],
question=tf.stack(questions, 0),
answer=tf.stack(answers, 0)
)
@seqio.map_over_dataset()
def charxiv_reasoning_preprocessor(ex):
return dict(
image=ex["image"],
question=ex["reasoning_q"],
answer=ex["reasoning_a"]
)
@seqio.map_over_dataset()
def tablevqa_preprocessor(ex):
return dict(
image=ex["image"],
question=ex["question"],
answer=ex["gt"]
)
@seqio.map_over_dataset()
def vtabfact_preprocessor(ex):
return dict(
image=ex["image"],
question=tf.strings.join([ex["question"], "Answer with yes or no."], separator="\n"),
answer=ex["gt"]
)
@seqio.map_over_dataset()
def nutrition_fact_preprocessor(ex):
question_names = ["descriptive_q", "reasoning_q"]
answer_names = ["descriptive_a", "reasoning_a"]
questions = [ex[name] for name in question_names]
answers = [ex[name] for name in answer_names]
return dict(
image=ex["image"],
question=tf.stack(questions, 0),
answer=tf.stack(answers, 0)
)