chrisc36 committed on
Commit 59a83f1
1 Parent(s): 8d085c7

Delete preprocesssors.py

Files changed (1)
  1. preprocesssors.py +0 -2472
preprocesssors.py DELETED
@@ -1,2472 +0,0 @@
1
- import hashlib
2
- import json
3
- import math
4
- from functools import reduce
5
- from typing import Mapping, Optional, Sequence
6
-
7
- import numpy as np
8
- import tensorflow as tf
9
- import seqio
10
- import gin
11
-
12
- from .data_utils import flatten_parts, stateless_permutation, stateless_shuffle
13
- from .. import config
14
-
15
-
16
- def get_from_dict(data, keys):
17
- """Iterate nested dictionary"""
18
- return reduce(dict.get, keys, data)
19
-
20
- def get_blank_image():
21
- image = tf.zeros([224, 224, 3], dtype=tf.uint8)
22
- image = tf.expand_dims(image, 0)[:1]
23
- return image
24
-
25
-
26
- @seqio.utils.map_over_dataset
27
- def rekey(x, key_map=None):
28
- """Replace the feature keys according to the mapping in `key_map`.
29
- For example, if the dataset returns examples of the format:
30
- {'foo': 'something', 'bar': 'something else'}
31
- and key_map = {'boo': 'foo', 'spar': 'bar'} then this function will return
32
- examples with the format
33
- {'boo': 'something', 'spar': 'something else'}
34
- If a mapping is to an empty key or None, set the new key to an empty string.
35
- Args:
36
- x: an example to process.
37
- key_map: dictionary mapping new keys to original keys
38
- Returns:
39
- A preprocessed example with the format listed above.
40
- """
41
- if key_map:
42
- out = {}
43
- for new_key, old_key in key_map.items():
44
- if isinstance(old_key, list):
45
- out[new_key] = get_from_dict(x, old_key)
46
- else:
47
- out[new_key] = x[old_key]
48
- return out
49
- return x
50
-
51
-
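As a quick illustration of the key remapping `rekey` performs (a sketch, not part of the deleted file; the feature names are made up, and the nested-key case relies on the `get_from_dict` helper above):

import tensorflow as tf

# hypothetical dataset with a flat and a nested feature
ds = tf.data.Dataset.from_tensors(
    {"foo": "something", "bar": {"baz": "something else"}})
# "boo" takes the value of "foo"; "spar" follows the key path ["bar", "baz"]
ds = rekey(ds, key_map={"boo": "foo", "spar": ["bar", "baz"]})
# -> {"boo": "something", "spar": "something else"}
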
52
- def rename(**kwargs):
53
- @seqio.map_over_dataset
54
- def _fn(x):
55
- updates = {}
56
- for new_key, old_key in kwargs.items():
57
- if isinstance(old_key, list):
58
- val = x[old_key[0]]
59
- for k in old_key[1:-1]:
60
- val = val[k]
61
- updates[new_key] = val.pop(old_key[-1])
62
- else:
63
- updates[new_key] = x.pop(old_key)
64
- x.update(updates)
65
- return x
66
- return _fn
67
-
68
-
69
- def extract_transcripts(ds):
70
- ds = flatten_parts(ds, ["transcripts"])
71
- def _map(ex):
72
- return dict(
73
- image=ex["image"],
74
- text=ex["transcripts"],
75
- url=ex["url"]
76
- )
77
- return ds.map(_map)
78
-
79
-
80
- @seqio.map_over_dataset
81
- def extract_caption_and_all_transcripts(ex):
82
- transcripts = tf.random.shuffle(ex["transcripts"])[:3]
83
- weight = 1.0 / tf.cast(tf.shape(transcripts)[0], tf.float32)
84
- return dict(
85
- image=ex["image"],
86
- text=tf.concat([tf.expand_dims(ex["caption"], 0), transcripts], 0),
87
- url=ex["url"],
88
- text_weights=tf.pad(
89
- tf.ones((1,), dtype=tf.float32), [[0, tf.shape(transcripts)[0]]],
90
- constant_values=weight),
91
- )
92
-
93
-
94
- @seqio.map_over_dataset
95
- def extract_all_transcripts(ex):
96
- transcripts = tf.random.shuffle(ex["transcripts"])[:3]
97
- weight = 3.0 / tf.cast(tf.shape(transcripts)[0], tf.float32)
98
- return dict(
99
- image=ex["image"],
100
- text=transcripts,
101
- url=ex["url"],
102
- text_weights=tf.fill((tf.shape(transcripts)[0],), weight),
103
- )
104
-
105
-
106
- @seqio.map_over_dataset
107
- def extract_transcript(ex):
108
- transcripts = tf.random.shuffle(ex["transcripts"])
109
- return dict(
110
- image=ex["image"],
111
- text=transcripts[0],
112
- url=ex["url"],
113
- )
114
-
115
-
116
- @seqio.map_over_dataset
117
- def extract_caption(ex):
118
- caption = ex["caption"]
119
- if len(caption.shape) > 0:
120
- ex["text"] = caption[0]
121
- else:
122
- ex["text"] = caption
123
- return ex
124
-
125
-
126
- @seqio.map_over_dataset
127
- def extract_joint_captions(ex):
128
- caption = ex["caption"]
129
- if len(caption.shape) > 0:
130
- caption = caption[0]
131
- _ix = tf.random.uniform((), 0, tf.shape(ex["transcripts"])[0], dtype=tf.int32)
132
- _ix = _ix % tf.shape(ex["transcripts"])[0]
133
- return dict(
134
- image=ex["image"],
135
- text=tf.stack([caption, ex["mistral_caption"], ex["transcripts"][_ix]], 0),
136
- url=ex["url"]
137
- )
138
-
139
-
140
- @seqio.map_over_dataset(num_seeds=1)
141
- def extract_caption_and_transcript(ex, seed):
142
- caption = ex["caption"]
143
- if len(caption.shape) > 0:
144
- caption = caption[0]
145
- _ix = tf.random.stateless_uniform((), seed, 0, tf.shape(ex["transcripts"])[0], dtype=tf.int32)
146
- return dict(
147
- image=ex["image"],
148
- text=tf.stack([caption, ex["transcripts"][_ix]], 0),
149
- url=ex["url"]
150
- )
151
-
152
-
153
- @seqio.map_over_dataset
154
- def caption_transcript_augmented(ex, sequence_length):
155
- caption = ex["caption"]
156
- if len(caption.shape) > 0:
157
- caption = caption[0]
158
- image = ex["image"]
159
- properties = []
160
-
161
- do_augmentation = sequence_length["is_training"]
162
- # do_augmentation = False
163
-
164
- # Keep this off, it screws up OCR
165
- # do_hflip = (tf.random.uniform(()) > 0.2 and do_augmentation)
166
- do_hflip = False
167
- if do_hflip:
168
- image = image[:, ::-1]
169
-
170
- # Mild color jitter
171
- do_color = (tf.random.uniform(()) > 0.5 and do_augmentation)
172
- if do_color:
173
- image = tf.image.random_hue(image, max_delta=0.05)
174
- image = tf.image.random_brightness(image, max_delta=0.2)
175
- image = tf.image.random_saturation(image, 0.7, 1.3)
176
- image = tf.image.random_contrast(image, 0.7, 1.3)
177
-
178
- # Mild affine transformation
179
- do_affine = (tf.random.uniform(()) > 0.5 and do_augmentation)
180
- if do_affine and do_augmentation:
181
- shift_x = tf.random.uniform((), -10, 10) * 0
182
- shift_y = tf.random.uniform((), -10, 10) * 0
183
- shear_x = tf.random.uniform((), -2, 2)
184
- shear_y = tf.random.uniform((), -2, 2)
185
- rotation = tf.random.uniform((), -6, 6)
186
- max_scale = 1.1
187
- scale = tf.random.uniform((), 0.8, max_scale)
188
- center = tf.cast(tf.shape(image), tf.float32)/2
189
-
190
- image = tf.keras.ops.image.affine_transform(
191
- image,
192
- tf.stack(get_affine_matrix(
193
- [center[0], center[1]],
194
- rotation,
195
- [shift_x, shift_y],
196
- 1/scale,
197
- [shear_x, shear_y]
198
- ) + [0., 0.]),
199
- interpolation='bilinear',
200
- fill_mode='constant',
201
- fill_value=1.,
202
- data_format='channels_last'
203
- )
204
-
205
- properties = tf.stack([
206
- ("[hflip]" if do_hflip else ""),
207
- ("[color]" if do_color else ""),
208
- ("[affine]" if do_affine else "")
209
- ])
210
- properties = tf.boolean_mask(properties, tf.strings.length(properties) > 0)
211
- prompt = tf.strings.reduce_join(properties, separator=" ")
212
- ix = tf.random.uniform((), 0, tf.shape(ex["transcripts"])[0], dtype=tf.int32)
213
- out = dict(
214
- image=image,
215
- text=tf.stack([caption, ex["transcripts"][ix]], 0),
216
- url=ex["url"],
217
- prompt=prompt,
218
- )
219
- # out["metadata/unaugmented_image"] = image
220
- return out
221
-
222
-
223
- def extract_caption_and_transcript_hflip(ds):
224
-
225
- # Just in case they are ordered somehow in Matt's data
226
- @seqio.map_over_dataset
227
- def _shuffle_transcripts(_ex):
228
- _ex["transcripts"] = tf.random.shuffle(_ex["transcripts"])
229
- _ex["hflip"] = tf.random.uniform((), 0, 3, dtype=tf.int32)
230
- return _ex
231
-
232
- ds = _shuffle_transcripts(ds)
233
-
234
- # Build a 3x long dataset with each individual transcript so we iterate through
235
- # each transcript
236
- @seqio.map_over_dataset
237
- def _with_transcript(ex, _ix):
238
- caption = ex["caption"]
239
- if len(caption.shape) > 0:
240
- caption = caption[0]
241
- hflip = ex["hflip"] == _ix
242
- if hflip:
243
- ex["image"] = ex["image"][:, ::-1]
244
- style = ["long_caption_flipped", "transcript_flipped"]
245
- else:
246
- style = ["long_caption", "transcript"]
247
- return dict(
248
- image=ex["image"],
249
- text=tf.stack([caption, ex["transcripts"][_ix]], 0),
250
- url=ex["url"],
251
- style=style
252
- )
253
-
254
- joint_ds = _with_transcript(ds, 0)
255
- for i in range(1, 3):
256
- joint_ds = joint_ds.concatenate(_with_transcript(ds, i))
257
-
258
- return joint_ds
259
-
260
-
261
- @seqio.map_over_dataset
262
- def extract_llava(ex, sequence_length, output_features):
263
- tf.assert_equal(tf.shape(ex['conversations']['value'])[0], 2)
264
- prompt = ex['conversations']['value'][0]
265
- text = ex['conversations']['value'][1]
266
- ex.pop('conversations')
267
- ex["text"] = text
268
- ex["prompt"] = prompt
269
- return ex
270
-
271
-
272
- def extract_localized_narrative(ds):
273
- ds = ds.filter(lambda ex: tf.shape(ex["cap/cap_caption"])[0] > 0)
274
- def _map(ex):
275
- return dict(
276
- image=ex["image"],
277
- text=tf.strings.reduce_join(ex["cap/cap_caption"], separator="\n")
278
- )
279
- return ds.map(_map)
280
-
281
-
282
- def float_to_text(val):
283
- return tf.strings.as_string(tf.cast(val * 100, tf.int32))
284
-
285
-
286
- @seqio.map_over_dataset
287
- def extract_vqa(ex):
288
- questions = ex["vqa"]["questions"]
289
- answers = ex["vqa"]["answers"]
290
- answers = tf.strings.reduce_join(answers, 1, separator="; ")
291
- qas = tf.strings.reduce_join(tf.stack([questions, answers], 1), separator=" ")
292
- return dict(
293
- image=ex["image"],
294
- text=tf.strings.reduce_join(qas, separator="\n")
295
- )
296
-
297
-
298
- @seqio.map_over_dataset
299
- def coco_image_id_from_path(ex):
300
- image_id = tf.strings.substr(ex["image/filename"], 0, tf.strings.length(ex["image/filename"])-4)
301
- ex["image_id"] = tf.strings.to_number(image_id)
302
- return ex
303
-
304
-
305
- @seqio.map_over_dataset
306
- def add_coco_url(ex):
307
- """Turns a COCO path into a URL, which can then be used in visualizations"""
308
- path = ex["image/filename"]
309
- if not tf.strings.regex_full_match(path, ".*/.*"):
310
- prefix = tf.strings.regex_replace(path, "COCO_", "")
311
- prefix = tf.strings.regex_replace(prefix, "_[0-9]+.jpg", "")
312
- path = tf.strings.join([prefix, path], separator="/")
313
-
314
- # images are hosted by the COCO website here
315
- url = tf.strings.join(["https://s3.us-east-1.amazonaws.com/images.cocodataset.org/", path])
316
- ex["metadata/image_url"] = url
317
- return ex
318
-
319
-
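Hand-tracing the regexes above with an illustrative filename: a bare name such as "COCO_train2014_000000123456.jpg" contains no "/", so the prefix "train2014" is extracted and the stored URL becomes https://s3.us-east-1.amazonaws.com/images.cocodataset.org/train2014/COCO_train2014_000000123456.jpg.
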
320
- def flatten_vqa(ds):
321
- parts = ["questions", "answers"]
322
- for k in ["id", "question_id"]:
323
- if k in ds.element_spec:
324
- parts.append(k)
325
- return flatten_parts(ds, parts)
326
-
327
-
328
- def format_gqa(ds, is_balanced=True, flatten=True):
329
- if is_balanced:
330
- ds = ds.filter(lambda x: tf.reduce_any(x["questions"]["is_balanced"]))
331
- def _filter_qs(ex):
332
- qs = ex["questions"]
333
- mask = qs["is_balanced"]
334
- qs = {k: tf.boolean_mask(v, mask) for k, v in qs.items()}
335
- ex["questions"] = qs
336
- return ex
337
- ds = ds.map(_filter_qs)
338
-
339
- if flatten:
340
- ds = flatten_parts(ds, ["questions"])
341
-
342
- def _rename(ex):
343
- out = ex["questions"]
344
- out["image"] = ex["image"]
345
- out["image_id"] = ex["image_id"]
346
- return out
347
- return ds.map(_rename)
348
-
349
-
350
- @seqio.map_over_dataset
351
- def fix_doqa_url(x):
352
- x["image_url"] = tf.strings.regex_replace(x["image_url"], "gs://", "")
353
- return x
354
-
355
-
356
- def _add_metadata(ex):
357
- out = {}
358
- if "id" in ex:
359
- out["metadata/example_id"] = ex["id"]
360
- elif "example_id" in ex:
361
- out["metadata/example_id"] = ex["example_id"]
362
- elif "question_id" in ex:
363
- out["metadata/example_id"] = ex["question_id"]
364
- if "image_url" in ex:
365
- out["metadata/image_url"] = ex["image_url"]
366
- for k, v in ex.items():
367
- if k.startswith("metadata/"):
368
- out[k] = v
369
- return out
370
-
371
-
372
- def image_only(ds):
373
- return ds.filter(lambda x: x["has_image"])
374
-
375
-
376
- def filter_difficult_direct_answer(ds):
377
- return ds.filter(lambda x: not x["difficult_direct_answer"])
378
-
379
-
380
- @seqio.map_over_dataset()
381
- def format_ai2d(ex, variable_style=True):
382
- abc = tf.constant(list("abcdefg".upper()))
383
- out = dict(image=ex["image"])
384
- out.update(_add_metadata(ex))
385
-
386
- options = ex["choices"]
387
- # >= 3 in case of none-of-the-above style answers
388
- n_options = tf.shape(ex["option_is_abc"])[0]
389
- if ex["abc_label"] and tf.reduce_sum(tf.cast(ex["option_is_abc"], tf.int32)) >= (n_options - 1):
390
- # The image labels are always upper, so use upper in the answer options
391
- options = tf.where(
392
- ex["option_is_abc"],
393
- tf.strings.upper(options),
394
- options
395
- )
396
- short_options = options
397
- style = "ai2_diagram_no_letter"
398
- else:
399
- short_options = abc[:tf.shape(options)[0]]
400
- options = tf.stack([short_options, options,], 1)
401
- options = tf.strings.reduce_join(options, axis=-1, separator=": ")
402
- style = "ai2_diagram"
403
-
404
- options = tf.strings.reduce_join(options, separator="\n")
405
- out["question"] = ex["question"]
406
- out["options"] = options
407
- if variable_style:
408
- out["style"] = style
409
- if ex["answer_idx"] < 0:
410
- out["text"] = "?"
411
- else:
412
- out["text"] = short_options[ex["answer_idx"]]
413
- out["metadata/answer_idx"] = ex["answer_idx"]
414
- tf.debugging.assert_equal(tf.reduce_any(tf.strings.regex_full_match(options, ".*\|\|\|.*")), False)
415
- out["metadata/option_names"] = tf.strings.reduce_join(short_options, separator="|||")
416
- out["metadata/has_transparent_box"] = ex.get("has_transparent_box", tf.constant(False))
417
- out["metadata/abc_label"] = ex["abc_label"]
418
- return out
419
-
420
-
421
- @gin.configurable()
422
- @seqio.map_over_dataset()
423
- def format_multiple_choice_qa(ex, option_format="abc"):
424
- assert option_format == "abc"
425
- abc = tf.constant(list("abcdefg".upper()))
426
- out = dict(image=ex["image"])
427
- out.update(_add_metadata(ex))
428
- options = ex["choices"]
429
- short_options = abc[:tf.shape(options)[0]]
430
- options = tf.stack([short_options, options,], 1)
431
- options = tf.strings.reduce_join(options, axis=-1, separator=": ")
432
- options = tf.strings.reduce_join(options, separator="\n")
433
- out["question"] = ex["question"]
434
- out["options"] = options
435
- if ex["answer_idx"] < 0:
436
- out["text"] = "?"
437
- else:
438
- out["text"] = short_options[ex["answer_idx"]]
439
- out["metadata/answer_idx"] = ex["answer_idx"]
440
- tf.debugging.assert_equal(tf.reduce_any(tf.strings.regex_full_match(options, ".*\|\|\|.*")), False)
441
- out["metadata/option_names"] = tf.strings.reduce_join(short_options, separator="|||")
442
- # out["metadata/option_names"] = tf.RaggedTensor.from_row_lengths(short_options, tf.shape(short_options))
443
- # out["metadata/option_names"] = short_options
444
- return out
445
-
446
-
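For intuition, the joined options string built above takes the following form (the choices are illustrative):

A: cat
B: dog
C: bird

`text` is the letter of the correct choice ("?" when answer_idx is negative), and metadata/option_names stores the letters joined with "|||", e.g. "A|||B|||C".
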
447
- @seqio.map_over_dataset()
448
- def output_options(ex):
449
- ex["metadata/options"] = ex["options"]
450
- return ex
451
-
452
-
453
- @seqio.map_over_dataset()
454
- def extract_tally_qa(ex):
455
- questions = ex.pop("questions")
456
- ex["questions"] = questions["question"]
457
- ex["answers"] = tf.strings.as_string(questions["answer"])
458
- ex["question_id"] = questions["question_id"]
459
- return ex
460
-
461
-
462
- @seqio.map_over_dataset()
463
- def count_bench_preprocessor(ex):
464
- return {
465
- "image": ex["image"],
466
- "text": tf.strings.as_string(ex["number"]),
467
- "object": ex["noun"],
468
- "question": tf.strings.join([
469
- "How many ", ex["noun"], " are there?"
470
- ]),
471
- "metadata/count": ex["number"],
472
- }
473
-
474
-
475
- def filter_human(ds):
476
- return ds.filter(lambda x: x["is_human"])
477
-
478
-
479
- def filter_aug(ds):
480
- return ds.filter(lambda x: not x["is_human"])
481
-
482
-
483
- @seqio.map_over_dataset()
484
- def reweight_chartqa(ex, human, aug):
485
- is_human = ex["metadata/is_human"]
486
- ex["text_weights"] = human if is_human else aug
487
- return ex
488
-
489
-
490
- @seqio.map_over_dataset()
491
- def chartqa_prompting(ex):
492
- question = tf.strings.join([ex["question"], " Answer:"])
493
- return dict(
494
- image=ex["image"],
495
- question=question,
496
- answer=ex["answer"]
497
- )
498
-
499
-
500
- @seqio.map_over_dataset()
501
- def chartqa_explanation(ex):
502
- question = tf.strings.join([ex["question"], " Explanation:"])
503
- out = {
504
- "image": ex["image"],
505
- "question": question,
506
- "answer": ex["answer"],
507
- }
508
- out.update({k: v for k, v in ex.items() if k.startswith("metadata/")})
509
- return out
510
-
511
-
512
- @seqio.map_over_dataset(num_seeds=1)
513
- def _preprocess_scifi(ex, seed):
514
- if "qa_pairs" in ex:
515
- q = ex["qa_pairs"]
516
- else:
517
- q = ex["qa"]
518
- ix = stateless_permutation(tf.shape(q["question"])[0], seed)
519
- return dict(
520
- image=ex["image"],
521
- question=tf.gather(q["question"], ix),
522
- explanation=tf.gather(q["explanation"], ix),
523
- answer=tf.gather(q["answer"], ix),
524
- )
525
-
526
- @seqio.map_over_dataset
527
- def scifi_explanation_only(ex):
528
- return dict(
529
- image=ex["image"],
530
- question=ex["question"],
531
- answer=ex["explanation"],
532
- )
533
-
534
-
535
- def filter_named_entity(ds):
536
- @seqio.map_over_dataset
537
- def _load_image(ex):
538
- ex["image"] = tf.image.decode_image(ex['image'], channels=3, expand_animations=False)
539
- return ex
540
-
541
- ds = _load_image(ds)
542
- return ds.filter(lambda x: tf.reduce_min(tf.shape(x["image"])[:2]) >= 32)
543
-
544
-
545
- @seqio.map_over_dataset()
546
- def extract_named_entity(ex):
547
- qs = ex["questions"]
548
- return {
549
- "image": ex["image"],
550
- "metadata/image_url": ex["url"],
551
- "metadata/entity": ex["entity"],
552
- "questions": qs["question"],
553
- "answers": qs["answer"],
554
- }
555
-
556
- @gin.configurable()
557
- def extract_individual_vqa(ds, test=False, answer_mode="best"):
558
-
559
- @seqio.map_over_dataset(num_seeds=1)
560
- def _extract(ex, seed):
561
- if "questions" in ex:
562
- question = ex["questions"]
563
- else:
564
- question = ex["question"]
565
- out = dict(
566
- image=ex["image"],
567
- question=question,
568
- )
569
- out.update(_add_metadata(ex))
570
- out["metadata/question"] = question
571
- if ex.get("answers") is not None:
572
- out["metadata/references"] = tf.strings.reduce_join(ex["answers"], separator="\n")
573
- elif ex.get("answer") is not None:
574
- out["metadata/references"] = ex["answer"]
575
-
576
- if not test:
577
- if "answer" in ex:
578
- answer = ex["answer"]
579
- else:
580
- answer = ex["answers"]
581
- if answer.dtype in [tf.int32, tf.int64]:
582
- answer = tf.strings.as_string(answer)
583
- if len(answer.shape) == 1 and tf.shape(answer)[0] == 0:
584
- answer = tf.expand_dims("", 0)
585
- if len(answer.shape) == len(question.shape):
586
- pass
587
- # Handle questions with multiple answers
588
- elif answer_mode == "random":
589
- assert len(answer.shape) == 1
590
- answer = answer[tf.random.stateless_uniform((), seed, 0, tf.shape(answer)[0], dtype=tf.int32)]
591
- elif answer_mode == "best":
592
- def _get_best(_answer):
593
- vals, _, counts = tf.unique_with_counts(_answer)
594
- count_thresh = tf.reduce_max(counts)
595
- vals = tf.boolean_mask(vals, counts >= count_thresh)
596
- return vals[tf.random.stateless_uniform((), seed, 0, tf.shape(vals)[0], dtype=tf.int32)]
597
- if len(answer.shape) == 1:
598
- answer = _get_best(answer)
599
- elif isinstance(answer, tf.RaggedTensor):
600
- n = tf.shape(answer)[0]
601
- answer_arr = tf.TensorArray(dtype=tf.string, size=n, element_shape=())
602
- for i in range(n):
603
- answer_arr = answer_arr.write(i, _get_best(answer[i]))
604
- answer = answer_arr.stack()
605
- else:
606
- answer = tf.map_fn(_get_best, answer)
607
- elif answer_mode == "all_segments":
608
- out["text"] = answer
609
- elif answer_mode == "all_segments_weighted":
610
- out["text"] = answer
611
- out["text_weights"] = 1.0 / tf.cast(tf.shape(answer)[-1], tf.float32)
612
- elif answer_mode == "all":
613
- if len(answer.shape) == 1:
614
- answer = stateless_shuffle(answer, seed)
615
- answer = tf.strings.reduce_join(answer, separator="\n", axis=-1)
616
- elif isinstance(answer, tf.RaggedTensor):
617
- n = tf.shape(answer)[0]
618
- answer_arr = tf.TensorArray(dtype=tf.string, size=n, element_shape=())
619
- for i in range(n):
620
- answer_arr = answer_arr.write(i, tf.strings.reduce_join(tf.random.shuffle(answer[i]), separator="\n", axis=-1))
621
- answer = answer_arr.stack()
622
- else:
623
- answer = tf.map_fn(tf.random.shuffle, answer)
624
- answer = tf.strings.reduce_join(answer, separator="\n", axis=-1)
625
- else:
626
- raise NotImplementedError()
627
- out["text"] = answer
628
- return out
629
- return _extract(ds)
630
-
631
-
632
- @seqio.map_over_dataset()
633
- def extract_khan_academy(ex):
634
- return dict(
635
- image=ex["image"],
636
- image_url=ex["image_url"],
637
- prompt="Answer this question",
638
- text=ex["gptResponse"]
639
- )
640
-
641
- @seqio.map_over_dataset()
642
- def extract_vaia_qa_latex_image(ex, add_short_answer=False, set_short_answer_first=False):
643
- if ex["has_image"]:
644
- image = tf.image.decode_image(ex['image'], channels=3, expand_animations=False)
645
- image = tf.expand_dims(image, 0)[:1]
646
- else:
647
- # image = get_blank_image() # blank image
648
- image = tf.image.decode_image(ex['image'], channels=3, expand_animations=False)
649
- image = tf.expand_dims(image, 0)[:0]
650
- img_h = tf.shape(image)[1]
651
- img_w = tf.shape(image)[2]
652
-
653
- if add_short_answer:
654
- if set_short_answer_first:
655
- answer = tf.strings.join(["Answer: ", ex["short_answer"], "\n\n", ex["answer"]])
656
- else:
657
- answer = tf.strings.join([ex["answer"], "\n\n", "Answer: ", ex["short_answer"]])
658
- else:
659
- answer = ex["answer"]
660
- out = dict(
661
- image=image, # 4-d tensor
662
- text=answer,
663
- prompt=tf.strings.join([ex["latex_question"], "\n"]),
664
- )
665
- out["metadata/images"] = image
666
- out.update(_add_metadata(ex))
667
- out["metadata/batch_id"] = ex["batch_id"]
668
- out["metadata/image_size"] = [img_w, img_h]
669
- return out
670
-
671
- @seqio.map_over_dataset()
672
- def extract_vqa_online(ex):
673
- out = dict(
674
- image=ex["image"],
675
- prompt=tf.strings.join([ex["question"], "\n"]),
676
- text=ex["answer"]
677
- )
678
- out.update(_add_metadata(ex))
679
- out["metadata/row_id"] = ex["row_id"]
680
- return out
681
-
682
-
683
- @seqio.map_over_dataset()
684
- def extract_scifi_joint(ex):
685
- if "qa_pairs" in ex:
686
- q = ex["qa_pairs"]
687
- else:
688
- q = ex["qa"]
689
- prompts = tf.concat([["Describe this image in detail."], q["question"]], 0)
690
- responses = tf.concat([ex["summary"][None], q["answer"]], 0)
691
- return dict(
692
- image=ex["image"],
693
- prompt=prompts,
694
- text=responses,
695
- )
696
-
697
-
698
- def remove_no_qa(ds):
699
- def _filter(ex):
700
- if "qa_pairs" in ex:
701
- q = ex["qa_pairs"]
702
- else:
703
- q = ex["qa"]
704
- return tf.shape(q["question"])[0] > 0
705
- return ds.filter(_filter)
706
-
707
-
708
- @seqio.map_over_dataset()
709
- def extract_scifi_qa_exp(ex):
710
- return dict(
711
- image=ex["image"],
712
- question=ex["question"], # Array of questions
713
- answer=tf.strings.join([ex["explanation"], " Answer: ", ex["answer"]]),
714
- )
715
-
716
-
717
- @seqio.map_over_dataset(num_seeds=1)
718
- def extract_scifi_qa_demo(ex, seed):
719
- # if tf.random.stateless_uniform((), 0, 1) > 0.5:
720
- answer = tf.strings.join([ex["explanation"], " Answer: ", ex["answer"]])
721
- # else:
722
- # answer = ex["explanation"]
723
- return dict(
724
- image=ex["image"],
725
- question=ex["question"], # Array of questions
726
- answer=answer,
727
- )
728
-
729
-
730
- @seqio.map_over_dataset()
731
- def clock_bench_preprocessor(ex):
732
- out = dict(
733
- image=ex["image"],
734
- prompt="What time is being shown?",
735
- )
736
- for k in ["hour", "minute", "second", "answerable"]:
737
- out[f"metadata/{k}"] = ex[k]
738
- return out
739
-
740
-
741
- def deg2rad(x):
742
- return x*math.pi/180.0
743
-
744
-
745
- def get_affine_matrix(center, angle, translate, scale, shear):
746
- # From https://github.com/pytorch/vision/blob/f96c42fca53230057b16941b078a0a9eee06e20f/torchvision/transforms/functional.py#L1006
747
- rot = deg2rad(angle)
748
- sx = deg2rad(shear[0])
749
- sy = deg2rad(shear[1])
750
-
751
- cx, cy = center
752
- tx, ty = translate
753
-
754
- # RSS without scaling
755
- a = tf.cos(rot - sy) / tf.cos(sy)
756
- b = -tf.cos(rot - sy) * tf.tan(sx) / tf.cos(sy) - tf.sin(rot)
757
- c = tf.sin(rot - sy) / tf.cos(sy)
758
- d = -tf.sin(rot - sy) * tf.tan(sx) / tf.cos(sy) + tf.cos(rot)
759
-
760
- matrix = [a, b, 0.0, c, d, 0.0]
761
- matrix = [x * scale for x in matrix]
762
- # Apply inverse of center translation: RSS * C^-1
763
- matrix[2] += matrix[0] * (-cx) + matrix[1] * (-cy)
764
- matrix[5] += matrix[3] * (-cx) + matrix[4] * (-cy)
765
- # Apply translation and center : T * C * RSS * C^-1
766
- matrix[2] += cx + tx
767
- matrix[5] += cy + ty
768
- return matrix
769
-
770
-
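A quick sanity check on the matrix math above (a sketch, not from the original file): with zero rotation, shear, and translation and unit scale, the returned coefficients reduce to the identity transform.

import tensorflow as tf

m = get_affine_matrix(center=[112., 112.], angle=0., translate=[0., 0.],
                      scale=1., shear=[0., 0.])
# values are [1, 0, 0, 0, 1, 0], i.e. the identity affine; callers above append
# [0., 0.] and pass the resulting 8-vector to tf.keras.ops.image.affine_transform.
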
771
- def quantize_point(coor, max_dim, mode="percent-precision-1"):
772
- max_dim = tf.cast(max_dim, tf.float32)
773
- coor = tf.cast(coor, tf.float32)
774
- x = (coor / max_dim)
775
- if mode == "percent-precision-1":
776
- return tf.strings.as_string(x*100, precision=1)
777
- elif mode == "zero_to_one":
778
- return tf.strings.as_string(x, precision=3)
779
- elif mode == "1k":
780
- return tf.strings.as_string(x*1000, precision=0)
781
- else:
782
- raise NotImplementedError(mode)
783
-
784
-
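The three quantization modes encode the same raw coordinate differently; a hand-worked example (values illustrative):

# a point at x = 150 in an image 400 pixels wide
quantize_point(150, 400, mode="percent-precision-1")  # -> "37.5"  (percent, 1 decimal)
quantize_point(150, 400, mode="zero_to_one")          # -> "0.375" (fraction, 3 decimals)
quantize_point(150, 400, mode="1k")                   # -> "375"   (per-mille, integer)
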
785
- def construct_pointing_format(label_text, alt_text, x_str, y_str):
786
- if alt_text is None:
787
- alt_text = label_text
788
- np = tf.shape(x_str)[0]
789
- if np == 0:
790
- output = ""
791
- elif np == 1:
792
- output = tf.strings.join([
793
- '<point x="', x_str[0], '" y="', y_str[0], '" alt="',
794
- alt_text, '">', label_text, '</point>'
795
- ])
796
- else:
797
- ids = tf.strings.as_string(tf.range(1, np + 1, dtype=tf.int32))
798
- xs = tf.strings.join(["x", ids, '="', x_str, '"'])
799
- ys = tf.strings.join(["y", ids, '="', y_str, '"'])
800
- points = tf.strings.reduce_join(tf.reshape(tf.stack([xs, ys], 1), [-1]), separator=' ', axis=-1)
801
- output = tf.strings.join(
802
- ["<points ", points, ' alt="', alt_text, '">', label_text, "</points>"])
803
- return output
804
-
805
-
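The markup this produces is the single-point and multi-point form used throughout the pointing tasks; hand-constructed examples (coordinates illustrative, not from the original file):

import tensorflow as tf

construct_pointing_format("dog", None, tf.constant(["37.5"]), tf.constant(["12.0"]))
# -> '<point x="37.5" y="12.0" alt="dog">dog</point>'

construct_pointing_format("dogs", None,
                          tf.constant(["10.0", "55.1"]), tf.constant(["20.3", "61.0"]))
# -> '<points x1="10.0" y1="20.3" x2="55.1" y2="61.0" alt="dogs">dogs</points>'
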
806
- def order_points(x, y, seed, point_order):
807
- if point_order == "natural":
808
- return x, y
809
-
810
- if point_order == "random":
811
- ix = stateless_permutation(tf.shape(x)[0], seed)
812
- elif point_order == "xy":
813
- x_float, y_float = tf.strings.to_number(x), tf.strings.to_number(y)
814
- ix = tf.argsort(x_float*100000 + y_float)
815
- elif point_order == "yx":
816
- x_float, y_float = tf.strings.to_number(x), tf.strings.to_number(y)
817
- ix = tf.argsort(y_float*100000 + x_float)
818
- else:
819
- raise NotImplementedError(point_order)
820
- return tf.gather(x, ix), tf.gather(y, ix)
821
-
822
-
823
- @gin.configurable()
824
- def points_to_text(x, y, w, h, seed, label=None, alt_text=None, point_mode="percent-precision-1",
825
- point_order="xy", point_list_mode="tag"):
826
- """Returns a string encoding of a list of points"""
827
- x = quantize_point(x, w, point_mode)
828
- y = quantize_point(y, h, point_mode)
829
- # Order the quantized points so the order matches what was generated; this can matter
830
- # when points have the same quantized value, e.g. (10.001, 20), (10.002, 10) should be
831
- # represented as (10, 10), (10, 20), but if we sort before quantization we get (10, 20), (10, 10)
832
- x, y = order_points(x, y, seed, point_order)
833
- if point_list_mode == "tag":
834
- return construct_pointing_format(label, alt_text, x, y)
835
- elif point_list_mode == "paren":
836
- n = tf.shape(x)[0]
837
- return tf.strings.reduce_join(tf.strings.join([
838
- "(", x, ", ", y, ")"
839
- ]), separator=", ")
840
- # if n == 0:
841
- # output = ""
842
- # else:
843
- # ids = tf.strings.as_string(tf.range(1, np + 1, dtype=tf.int32))
844
- # xs = tf.strings.join(["x", ids, '="', x_str, '"'])
845
- # ys = tf.strings.join(["y", ids, '="', y_str, '"'])
846
- # points = tf.strings.reduce_join(tf.reshape(tf.stack([xs, ys], 1), [-1]), separator=' ', axis=-1)
847
- # output = tf.strings.join(
848
- # ["<points ", points, ' alt="', alt_text, '">', label_text, "</points>"])
849
- # return output
850
- else:
851
- raise NotImplementedError(point_list_mode)
852
-
853
-
854
- def points_to_answer(x, y, w, h, seed, label, is_counting, alt_text=None):
855
- count = tf.shape(x)[0]
856
- if is_counting:
857
- if count == 0:
858
- return "There are none."
859
- else:
860
- point_text = points_to_text(x, y, w, h, seed, label, alt_text)
861
- return tf.strings.join([
862
- "Counting the ", point_text,
863
- " shows a total of ",
864
- tf.strings.as_string(count),
865
- "."
866
- ])
867
- else:
868
- if count == 0:
869
- return "There are none."
870
- else:
871
- return points_to_text(x, y, w, h, seed, label, alt_text)
872
-
873
-
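With the defaults in `points_to_text` (percent coordinates, x-then-y ordering), the counting variant produces answers like the following (a hand-traced sketch; the seed is unused for the default "xy" ordering):

import tensorflow as tf

points_to_answer(tf.constant([150.]), tf.constant([48.]), w=400, h=400,
                 seed=tf.constant([0, 0]), label="dog", is_counting=True)
# -> 'Counting the <point x="37.5" y="12.0" alt="dog">dog</point> shows a total of 1.'
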
874
- @seqio.map_over_dataset(num_seeds=2)
875
- def extract_point_qa(ex, seeds, answer_type="y_major"):
876
- ex["image"] = tf.image.decode_image(ex['image'], channels=3, expand_animations=False)
877
- img_h = tf.shape(ex["image"])[0]
878
- img_w = tf.shape(ex["image"])[1]
879
-
880
- questions = ex["questions"]
881
- question = questions["question"]
882
- n = tf.shape(question)[0]
883
- answers = tf.TensorArray(tf.string, size=n, element_shape=())
884
- point_text = questions["annotations"]["point_text"]
885
- point_seeds = tf.RaggedTensor.from_row_splits(
886
- row_splits=point_text.row_splits,
887
- values=tf.random.split(seeds[0], num=tf.shape(point_text.values)[0])
888
- )
889
- for question_ix in range(n):
890
- anno = questions["annotations"]
891
- answer = questions["answer_with_placeholders"][question_ix]
892
- n_anno = tf.shape(anno["point_text"][question_ix])[0]
893
- for anno_ix in range(n_anno):
894
- points = anno["points"][question_ix, anno_ix]
895
- point_text = points_to_answer(
896
- points[:, 0], points[:, 1], 100, 100,
897
- point_seeds[question_ix, anno_ix],
898
- anno["point_text"][question_ix, anno_ix],
899
- False,
900
- alt_text=anno["alt_text"][question_ix, anno_ix],
901
- )
902
- answer_split = tf.strings.split(answer, sep="<|POINT|>", maxsplit=1)
903
- answer = tf.strings.join([answer_split[0], point_text, answer_split[1]])
904
- # Make sure all placeholders were used
905
- tf.debugging.assert_equal(tf.shape(tf.strings.split(answer, sep="<|POINT|>"))[0], 1)
906
- answers = answers.write(question_ix, answer)
907
-
908
- messages = tf.stack([question, answers.stack()], axis=1)
909
- messages = tf.reshape(messages, [-1])
910
- conversation_ids = tf.range(tf.shape(messages)[0] // 2, dtype=tf.int32)
911
- conversation_ids = tf.repeat(conversation_ids, 2)
912
- out = dict(
913
- image=ex["image"],
914
- messages=tf.RaggedTensor.from_value_rowids(messages, conversation_ids)
915
- )
916
- ix = stateless_permutation(tf.shape(messages)[0], seeds[1])
917
- messages = tf.gather(messages, ix)
918
- out.update(_add_metadata(ex))
919
- out["metadata/image_size"] = [img_w, img_h]
920
- return out
921
-
922
-
923
- def select_point(mask):
924
- bs = tf.shape(mask)[0]
925
- valid = tf.cast(mask, tf.float32)
926
- h, w = tf.shape(mask)[1], tf.shape(mask)[2]
927
- ys = tf.range(h, dtype=tf.int32)
928
- xs = tf.range(w, dtype=tf.int32)
929
-
930
- n = tf.reduce_sum(valid, [1, 2])
931
- cy = tf.reduce_sum(tf.cast(ys[None, :, None], tf.float32) * valid, [1, 2]) / n # [bs]
932
- cx = tf.reduce_sum(tf.cast(xs[None, None, :], tf.float32) * valid, [1, 2]) / n # [bs]
933
-
934
- dist_y = tf.square(tf.range(h, dtype=tf.float32)[None, :] - cy[:, None]) # [bs, h]
935
- dist_x = tf.square(tf.range(w, dtype=tf.float32)[None, :] - cx[:, None]) # [bs, w]
936
- dist = dist_y[:, :, None] + dist_x[:, None, :] # [batch, h, w]
937
- dist = dist + (1 - valid) * 1e12
938
- min_dist = tf.argmin(tf.reshape(dist, [bs, -1]), axis=-1) # [batch]
939
- w = tf.cast(w, min_dist.dtype)
940
- cy = tf.cast(min_dist // w, tf.float32)
941
- cx = tf.cast(min_dist % w, tf.float32)
942
- return cx, cy
943
-
944
-
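`select_point` returns, for each mask in the batch, the coordinates of an in-mask pixel closest to the mask's centroid, so the chosen point lies on the object even when the centroid itself falls outside it. A small sketch, not part of the original file:

import tensorflow as tf

# one 5x5 ring-shaped mask: the centroid lands in the hole, so the nearest
# valid pixel on the ring is returned instead
mask = tf.constant([[[0, 1, 1, 1, 0],
                     [1, 0, 0, 0, 1],
                     [1, 0, 0, 0, 1],
                     [1, 0, 0, 0, 1],
                     [0, 1, 1, 1, 0]]], dtype=tf.bool)
cx, cy = select_point(mask)   # float tensors of shape [1], in pixel coordinates
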
945
- @seqio.map_over_dataset
946
- def refexp_pointing(ex):
947
- img_h = tf.shape(ex["image"])[0]
948
- img_w = tf.shape(ex["image"])[1]
949
- objects = ex["objects"]
950
-
951
- # Shuffle objects so that which object gets truncated (if the sequence is truncated) is randomized
952
- refexps = objects['refexp']['raw']
953
- bbox = objects["bbox"]
954
- mask = tf.squeeze(objects["mask"], -1)
955
-
956
- ix = tf.range(0, tf.shape(refexps)[0], dtype=tf.int32)
957
- ix = tf.random.shuffle(ix)
958
- refexps = tf.gather(refexps, ix)
959
- bbox = tf.gather(bbox, ix)
960
- mask = tf.gather(mask, ix)
961
-
962
- cx, cy = select_point(mask)
963
- answers = points_to_text(img_h, img_w, cx, cy)
964
-
965
- out = {
966
- "image": ex["image"],
967
- "refexp": refexps.values,
968
- "metadata/image_size": tf.stack([img_w, img_h,]),
969
- "text": tf.repeat(answers, refexps.row_lengths()),
970
- }
971
- if "image_url" in ex:
972
- out["metadata/image_url"] = ex["image_url"]
973
- return out
974
-
975
-
976
- @seqio.map_over_dataset
977
- def refexp_pointing_inf(ex):
978
- img_h = tf.shape(ex["image"])[0]
979
- img_w = tf.shape(ex["image"])[1]
980
-
981
- objects = ex["objects"]
982
- mask = tf.squeeze(objects["mask"], -1)
983
- cx, cy = select_point(mask)
984
- answers = points_to_text(img_h, img_w, cx, cy)
985
-
986
- refexps = objects["refexp"]["raw"]
987
-
988
- # We can't use `mask` directly since it is variable size, and thus it
989
- # will break batching. Here we serialize it instead
990
- serialized_masks = tf.map_fn(tf.io.serialize_tensor, mask, fn_output_signature=tf.string)
991
- out = {
992
- "image": ex["image"],
993
- "refexp": refexps,
994
- "metadata/bbox": objects["bbox"],
995
- "metadata/answer": answers,
996
- "metadata/mask": serialized_masks,
997
- "metadata/image_size": tf.stack([img_w, img_h]),
998
- }
999
- out.update({k: v for k, v in ex.items() if k.startswith("metadata/")})
1000
- return out
1001
-
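Because the ragged-size masks are serialized to scalar strings above (so batching still works), a downstream consumer has to invert the serialization; a minimal sketch of that round trip, assuming the same dtype that was serialized:

import tensorflow as tf

serialized = tf.io.serialize_tensor(tf.zeros([4, 7], dtype=tf.uint8))
restored = tf.io.parse_tensor(serialized, out_type=tf.uint8)  # back to shape [4, 7]
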
1002
- @seqio.map_over_dataset
1003
- def extract_andriod_control_inf(ex, mode):
1004
- if mode == "ll":
1005
- prompt = tf.strings.join(["low_level: ", ex["metadata/ll_instruction"]])
1006
- elif mode == "hl_ll":
1007
- prompt = tf.strings.join([
1008
- "high_level: ", ex["metadata/hl_instruction"],
1009
- " low_level: ", ex["metadata/ll_instruction"]
1010
- ])
1011
- elif mode == "hl":
1012
- prompt = tf.strings.join(["high_level: ", ex["metadata/hl_instruction"]])
1013
- elif mode == "hl_cot":
1014
- prompt = tf.strings.join(["high_level_cot: ", ex["metadata/hl_instruction"]])
1015
- else:
1016
- raise NotImplementedError()
1017
-
1018
- out = dict(
1019
- image=ex["image"],
1020
- prompt=prompt,
1021
- text=ex["metadata/target_action"]
1022
- )
1023
- out.update(_add_metadata(ex))
1024
- return out
1025
-
1026
- @seqio.map_over_dataset
1027
- def extract_android_control(ex):
1028
- # Each image has four tasks:
1029
- # low level -> action
1030
- # high+low level -> action
1031
- # high level -> action
1032
- # high level -> low level + action (CoT)
1033
- out = dict(
1034
- image=ex["image"],
1035
- prompt=tf.stack([
1036
- tf.strings.join(["low_level: ", ex["metadata/ll_instruction"]]),
1037
- tf.strings.join([
1038
- "high_level: ", ex["metadata/hl_instruction"],
1039
- " low_level: ", ex["metadata/ll_instruction"]
1040
- ]),
1041
- tf.strings.join(["high_level: ", ex["metadata/hl_instruction"]]),
1042
- tf.strings.join(["high_level_cot: ", ex["metadata/hl_instruction"]]),
1043
- ]),
1044
- text=tf.stack([
1045
- ex["metadata/target_action"],
1046
- ex["metadata/target_action"],
1047
- ex["metadata/target_action"],
1048
- tf.strings.join(["Plan: ", ex["metadata/ll_instruction"], " Action: ", ex["metadata/target_action"]]),
1049
- ])
1050
- )
1051
- # Only needed if visualizing
1052
- # ex["image"] = tf.image.decode_image(ex['image'], channels=3, expand_animations=False)
1053
- # img_h = tf.shape(ex["image"])[0]
1054
- # img_w = tf.shape(ex["image"])[1]
1055
- # out["metadata/image_size"] = tf.stack([img_w, img_h,])
1056
- out.update(_add_metadata(ex))
1057
- return out
1058
-
1059
-
1060
- @seqio.map_over_dataset(num_seeds=1)
1061
- def refexp(ex, seed):
1062
- img_h = tf.shape(ex["image"])[0]
1063
- img_w = tf.shape(ex["image"])[1]
1064
- objects = ex["objects"]
1065
-
1066
- # Shuffle objects so that which object gets truncated (if the sequence is truncated) is randomized
1067
- refexps = objects['refexp']['raw']
1068
- bbox = objects["bbox"]
1069
- ix = stateless_permutation(tf.shape(refexps)[0], seed)
1070
- refexps = tf.gather(refexps, ix)
1071
- bbox = tf.gather(bbox, ix)
1072
-
1073
- x2 = bbox[:, 0] + bbox[:, 2]
1074
- y2 = bbox[:, 1] + bbox[:, 3]
1075
- with tf.control_dependencies([
1076
- tf.debugging.assert_equal(tf.reduce_any(x2 <= tf.cast(img_w, tf.float32)), True),
1077
- tf.debugging.assert_equal(tf.reduce_any(y2 <= tf.cast(img_h, tf.float32)), True)
1078
- ]):
1079
- answers = points_to_text(
1080
- img_h, img_w,
1081
- tf.reshape(tf.stack([bbox[:, 0], x2], 1), [-1]),
1082
- tf.reshape(tf.stack([bbox[:, 1], y2], 1), [-1]))
1083
- answers = tf.strings.reduce_join(tf.reshape(answers, [-1, 2]), separator=" ", axis=1)
1084
-
1085
- out = {
1086
- "image": ex["image"],
1087
- "refexp": refexps.values,
1088
- "metadata/bbox": bbox,
1089
- "metadata/image_size": tf.stack([img_w, img_h,]),
1090
- "text": tf.repeat(answers, refexps.row_lengths()),
1091
- }
1092
-
1093
- if "image_url" in ex:
1094
- out["image_url"] = ex["image_url"]
1095
- return out
1096
-
1097
-
1098
- @seqio.map_over_dataset
1099
- def refexp_inf(ex):
1100
- img_h = tf.shape(ex["image"])[0]
1101
- img_w = tf.shape(ex["image"])[1]
1102
- out = {
1103
- "image": ex["image"],
1104
- "refexp": ex["objects"]["refexp"]["raw"],
1105
- "metadata/bbox": ex["objects"]["bbox"],
1106
- "metadata/image_size": tf.stack([img_w, img_h,]),
1107
- }
1108
- out.update({k: v for k, v in ex.items() if k.startswith("metadata/")})
1109
- return out
1110
-
1111
-
1112
- def point_text_interleaved(*args):
1113
- raise NotImplementedError()
1114
-
1115
-
1116
- @seqio.map_over_dataset
1117
- def web_pointing_preprocessor(ex):
1118
- img_h = tf.shape(ex["image"])[0]
1119
- img_w = tf.shape(ex["image"])[1]
1120
-
1121
- question = point_text_interleaved(
1122
- img_h, img_w, ex["question"], ex["question_points"]["x"], ex["question_points"]["y"])
1123
- answer = point_text_interleaved(
1124
- img_h, img_w, ex["answer"], ex["answer_points"]["x"], ex["answer_points"]["y"])
1125
- answer_points = tf.stack([ex["answer_points"]["x"], ex["answer_points"]["y"]], axis=1)
1126
- return {
1127
- "question": question,
1128
- "answer": answer,
1129
- "image": ex["image"],
1130
- "metadata/image_size": [img_w, img_h],
1131
- "metadata/question_type": ex["question_type"],
1132
- "metadata/answer_points": tf.io.serialize_tensor(answer_points),
1133
- "metadata/answer": answer,
1134
- }
1135
-
1136
-
1137
- def filter_pointing(ds):
1138
- return ds.filter(lambda ex: tf.shape(ex["answer_points"]["x"])[0] >= 1)
1139
-
1140
-
1141
- def filter_qa(ds):
1142
- return ds.filter(lambda ex: tf.shape(ex["answer_points"]["x"])[0] == 0)
1143
-
1144
- # vaia filtering
1145
- def filter_image_only(ds):
1146
- return ds.filter(lambda ex: ex["has_image"])
1147
-
1148
- def filter_mc(ds):
1149
- return ds.filter(lambda ex: ex["is_mc"])
1150
-
1151
- def remove_is_long(ds):
1152
- return ds.filter(lambda ex: not ex["is_long"])
1153
-
1154
- def remove_has_multiple_parts(ds):
1155
- return ds.filter(lambda ex: not ex["has_multiple_parts"])
1156
-
1157
-
1158
- def _split(ds: tf.data.Dataset, keys, n_splits=2):
1159
- def _map(ex):
1160
- n = tf.shape(ex[keys[0]])[0]
1161
- if n < n_splits:
1162
- return tf.data.Dataset.from_tensors(ex)
1163
- else:
1164
- # import pdb; pdb.set_trace()
1165
- bs = n // n_splits
1166
- remainder = n - bs*n_splits
1167
- lens = tf.concat([
1168
- tf.ones([remainder], dtype=tf.int32),
1169
- tf.zeros([n_splits-remainder], dtype=tf.int32),
1170
- ], axis=0) + bs
1171
- tf.debugging.assert_equal(tf.reduce_sum(lens), n)
1172
- ends = tf.cumsum(lens)
1173
-
1174
- parts = []
1175
- for split_ix in range(n_splits):
1176
- part_ex = dict(ex)
1177
- e = ends[split_ix]
1178
- s = e - lens[split_ix]
1179
- for k in keys:
1180
- if isinstance(k, tuple):
1181
- assert len(k) == 2
1182
- part_ex[k[0]][k[1]] = ex[k[0]][k[1]][s:e]
1183
- else:
1184
- part_ex[k] = ex[k][s:e]
1185
- parts.append(part_ex)
1186
-
1187
- ds = tf.data.Dataset.from_tensors(parts[0])
1188
- for sub_ds in parts[1:]:
1189
- sub_ds = tf.data.Dataset.from_tensors(sub_ds)
1190
- ds = ds.concatenate(sub_ds)
1191
- return ds
1192
-
1193
- return ds.flat_map(_map)
1194
-
1195
-
1196
-
1197
- def split(ds, n=2):
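Hand-tracing the split arithmetic in `_split` above with illustrative sizes: an example whose key has 5 entries and n_splits=2 gives bs = 5 // 2 = 2 and remainder = 1, so lens = [3, 2] and the example is emitted as two examples holding the first 3 and the last 2 entries; keys not listed are copied unchanged into every part.
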
1198
- # return ds
1199
- return _split(ds, [k for k in [
1200
- "question",
1201
- "label",
1202
- "text",
1203
- "entity",
1204
- "messages"
1205
- ] if k in ds.element_spec], n_splits=n)
1206
-
1207
-
1208
- def split_points(ds, max_points=50):
1209
- label = "question" if "question" in ds.element_spec else "label"
1210
- return _split(ds, [
1211
- "question", label, "notInImage",
1212
- ("answer_points", "x"),
1213
- ("answer_points", "y"),
1214
- ])
1215
-
1216
-
1217
- @seqio.map_over_dataset
1218
- def fix_count_qa(ex):
1219
- ex["label"] = ex["label"][::2]
1220
- tf.debugging.assert_equal(tf.shape(ex["answer_points"]["x"])[0], tf.shape(ex["label"])[0])
1221
- return ex
1222
-
1223
-
1224
- def filter_points(ds, max_number=40):
1225
-
1226
- def _add_valid(ex):
1227
- valid = (
1228
- tf.reduce_all(ex["answer_points"]["x"] >= 0.0, axis=-1) &
1229
- tf.reduce_all(ex["answer_points"]["x"] <= 100.0, axis=-1) &
1230
- tf.reduce_all(ex["answer_points"]["y"] >= 0.0, axis=-1) &
1231
- tf.reduce_all(ex["answer_points"]["y"] <= 100.0, axis=-1) &
1232
- (ex["answer_points"]["y"].row_lengths() <= max_number)
1233
- )
1234
- ex["valid"] = valid
1235
- return ex
1236
- ds = ds.map(_add_valid)
1237
- ds = ds.filter(lambda ex: tf.reduce_any(ex["valid"]))
1238
- return ds
1239
-
1240
-
1241
- # def filter_points(ds, max_number=30):
1242
- # n_points = ds["answer_points"]["x"].row_lengths()
1243
- # parts = tf.TensorArray(tf.int32, size=tf.shape(n_points[0]), element_shape=tf.TensorShape([None]))
1244
- # total = 0
1245
- # on_row = 0
1246
- # for i in range(n_points):
1247
- # n = n_points[i]
1248
- # if n > max_number:
1249
- # continue
1250
- # if n + total > max_number:
1251
- #
1252
- # return ds
1253
-
1254
-
1255
- @seqio.map_over_dataset(num_seeds=2)
1256
- def pointing_preprocessor(ex, sequence_length, seeds, with_count=False):
1257
- image = tf.image.decode_image(ex['image'], channels=3, expand_animations=False)
1258
- img_h = tf.shape(image)[0]
1259
- img_w = tf.shape(image)[1]
1260
-
1261
- ix = tf.where(ex["valid"])[:, 0]
1262
- ix = stateless_shuffle(ix, seeds[0])
1263
- if "label" in ex:
1264
- question = tf.strings.lower(ex["label"])
1265
- else:
1266
- question = ex["question"]
1267
- question = tf.gather(question, ix) # [n_question]
1268
- points_x = tf.gather(ex["answer_points"]["x"], ix) # [n_question, n_points[ragged]]]
1269
- points_y = tf.gather(ex["answer_points"]["y"], ix)
1270
- not_in_image = tf.gather(ex["notInImage"], ix) # [n_question]
1271
-
1272
- n = tf.shape(points_x)[0]
1273
- point_text = tf.TensorArray(dtype=tf.string, size=n, element_shape=()) # [n_question]
1274
- point_seeds = tf.random.split(seeds[1], n)
1275
- for i in range(n):
1276
- answer = points_to_answer(points_x[i], points_y[i], 100, 100, point_seeds[i], question[i], with_count)
1277
- point_text = point_text.write(i, answer)
1278
- return {
1279
- "image": image,
1280
- "metadata/image_size": [img_w, img_h],
1281
- "entity": question,
1282
- "question": question,
1283
- "text": point_text.stack(),
1284
- }
1285
-
1286
-
1287
- @seqio.map_over_dataset
1288
- def pointing_inf_preprocessor(ex):
1289
- ex["image"] = tf.image.decode_image(ex['image'], channels=3, expand_animations=False)
1290
- img_h = tf.shape(ex["image"])[0]
1291
- img_w = tf.shape(ex["image"])[1]
1292
-
1293
- question = ex["question"]
1294
- not_in_image = tf.shape(ex["answer_points"]["x"])[0] == 0
1295
-
1296
- # points are stored in normalized format, de-normalize here
1297
- points_x = ex["answer_points"]["x"] * tf.cast(img_w, tf.float32) / 100.0
1298
- points_y = ex["answer_points"]["y"] * tf.cast(img_h, tf.float32) / 100.0
1299
-
1300
- out = dict(
1301
- image=ex["image"],
1302
- question=question,
1303
- entity=question,
1304
- )
1305
- out.update(_add_metadata(ex))
1306
- out["metadata/not_in_image"] = not_in_image
1307
- # We can't use `mask` directly since it is variable size, and thus it
1308
- # will break batching. Here we serialize it instead
1309
- serialized_masks = tf.map_fn(tf.io.serialize_tensor, ex["masks"], fn_output_signature=tf.string)
1310
- serialized_masks = tf.strings.reduce_join(serialized_masks, separator="|||")
1311
- out["metadata/mask"] = serialized_masks
1312
- out["metadata/question"] = question
1313
- out["metadata/answer_points"] = tf.io.serialize_tensor(tf.stack([points_x, points_y], 1))
1314
- out["metadata/image_size"] = [img_w, img_h]
1315
-
1316
- return out
1317
-
1318
-
1319
- @seqio.map_over_dataset(num_seeds=1)
1320
- def count_qa_preprocessor_inf(ex, sequence_length, seed):
1321
- image = tf.image.decode_image(ex['image'], channels=3, expand_animations=False)
1322
- img_h = tf.shape(image)[0]
1323
- img_w = tf.shape(image)[1]
1324
-
1325
- entity = tf.strings.substr(
1326
- ex["question"], len("How many "), tf.strings.length(ex["question"]) - len("How many "))
1327
- entity = tf.strings.split(entity, sep=" are ", maxsplit=1)[0]
1328
- entity = tf.strings.lower(entity)
1329
- tf.debugging.assert_equal(tf.strings.length(entity) != 0, True)
1330
-
1331
- return {
1332
- "image": image,
1333
- "metadata/image_size": [img_w, img_h],
1334
- "metadata/count": tf.strings.to_number(ex["answer"]),
1335
- "question": ex["question"],
1336
- "entity": entity,
1337
- }
1338
-
1339
-
1340
- @seqio.map_over_dataset(num_seeds=1)
1341
- def count_qa_preprocessor(ex, sequence_length, seed, with_count=False,
1342
- for_inference=False):
1343
- point_answer = ex["point_answer"]
1344
- numbers_str = tf.strings.regex_replace(point_answer, r'\.$', '')
1345
- numbers_str = tf.strings.regex_replace(numbers_str, r'[^\d\.\s]+', '')
1346
- numbers_str = tf.strings.strip(numbers_str)
1347
- numbers = tf.strings.split(numbers_str)
1348
- float_numbers = tf.strings.to_number(numbers, out_type=tf.float32)
1349
- coordinates = tf.reshape(float_numbers, (-1, 3))
1350
- points_x = coordinates[:, 1]
1351
- points_y = coordinates[:, 2]
1352
-
1353
- image = tf.image.decode_image(ex['image'], channels=3, expand_animations=False)
1354
- img_h = tf.shape(image)[0]
1355
- img_w = tf.shape(image)[1]
1356
- entity = tf.strings.substr(
1357
- ex["question"], len("How many "), tf.strings.length(ex["question"]) - len("How many "))
1358
- entity = tf.strings.split(entity, sep=" are ", maxsplit=1)[0]
1359
- entity = tf.strings.lower(entity)
1360
- tf.debugging.assert_equal(tf.strings.length(entity) != 0, True)
1361
- count = tf.strings.to_number(ex["answer"], out_type=tf.int32)
1362
- if for_inference:
1363
- return {
1364
- "image": image,
1365
- "metadata/image_size": [img_w, img_h],
1366
- "metadata/count": count,
1367
- "question": ex["question"],
1368
- "entity": entity,
1369
- }
1370
- else:
1371
- tf.debugging.assert_equal(count, tf.shape(points_x)[0])
1372
- # points are already normalized so use w=1, h=1
1373
- answer = points_to_answer(points_x, points_y, 1, 1, seed, entity, with_count)
1374
- return {
1375
- "image": image,
1376
- "metadata/image_size": [img_w, img_h],
1377
- "metadata/count": count,
1378
- "question": ex["question"],
1379
- "entity": entity,
1380
- "text": answer,
1381
- }
1382
-
1383
-
1384
- @gin.configurable()
1385
- @seqio.map_over_dataset
1386
- def cleanup_preprocessor(ex, preprocess=False):
1387
- if preprocess:
1388
- ex["prompt"] = tf.strings.join(
1389
- [
1390
- "[[User]]: Correct the spelling and punctuation mistakes on the following transcript based on what appears in the image.\n\n{before} ",
1391
- ex["prompt"],
1392
- "\n[[Assistant]]: {after}"
1393
- ]
1394
- )
1395
- return ex
1396
- else:
1397
- return ex
1398
-
1399
-
1400
- @gin.configurable()
1401
- @seqio.map_over_dataset
1402
- def random_text_preprocessor(ex, preprocess=False):
1403
- ex["prompt"] = "What does the text say in this image?"
1404
- if preprocess:
1405
- ex["prompt"] = tf.strings.join(["[[User]]: ", ex["prompt"], "\n[[Assistant]]:"])
1406
- return ex
1407
- else:
1408
- return ex
1409
-
1410
-
1411
- @seqio.map_over_dataset(num_seeds=25)
1412
- def clock_augmentation(ex, seeds):
1413
- seeds = list(seeds)
1414
- image = ex["image"]
1415
-
1416
- # Apply shear, rotation, and scale through one affine matrix
1417
- height = tf.cast(tf.shape(image)[0], tf.float32)
1418
- width = tf.cast(tf.shape(image)[1], tf.float32)
1419
-
1420
- _call_id = [0]
1421
-
1422
- def _rng(_minval=0, _maxval=1, shape=(), dtype=tf.float32):
1423
- return tf.random.stateless_uniform(shape, seeds.pop(), _minval, _maxval, dtype=dtype)
1424
-
1425
- sel = _rng(0, 1)
1426
- if sel < 0.1:
1427
- # Straight on
1428
- shear_x = 0.
1429
- shear_y = 0.
1430
- rotation = 0.
1431
- elif sel < 0.5:
1432
- # Normal looking
1433
- shear_x = _rng(-10, 10)
1434
- shear_y = _rng(-10, 10)
1435
- rotation = _rng(-25, 25)
1436
- else:
1437
- # Allowed to be very wonky
1438
- # if tf.random.stateless_uniform((), seeds.pop(), 0, 1) > 0.8:
1439
- # image = image[:, ::-1]
1440
-
1441
- if _rng() > 0.5:
1442
- shear_x = _rng( -30, 30)
1443
- shear_y = _rng( -30, 30)
1444
- else:
1445
- shear_x = _rng( -10, 10)
1446
- shear_y = _rng( -10, 10)
1447
- rng = _rng( 0, 1)
1448
- if rng < 0.2:
1449
- rotation = _rng( -25, 25)
1450
- elif rng < 0.6:
1451
- rotation = _rng( -80, 80)
1452
- else:
1453
- rotation = _rng( -180, 180)
1454
-
1455
- if _rng() > 0.5:
1456
- scale = _rng( 0.3, 2)
1457
- else:
1458
- scale = _rng( 0.3, 1)
1459
- # Pad so upscaling/rotation will not move the image out of bounds
1460
- pad = tf.cast(tf.maximum(height, width)*0.5, tf.int32)
1461
- image = tf.pad(image, [[pad, pad], [pad, pad], [0, 0]], constant_values=1)
1462
- height = tf.cast(tf.shape(image)[0], tf.float32)
1463
- width = tf.cast(tf.shape(image)[1], tf.float32)
1464
-
1465
- image = tf.keras.ops.image.affine_transform(
1466
- image,
1467
- tf.stack(get_affine_matrix(
1468
- [height/2, width/2],
1469
- rotation,
1470
- [0, 0],
1471
- 1/scale,
1472
- [shear_x, shear_y]
1473
- ) + [0., 0.]),
1474
- interpolation='bilinear',
1475
- fill_mode='constant',
1476
- fill_value=1.,
1477
- data_format='channels_last'
1478
- )
1479
-
1480
- # Crop to the non-white content; otherwise it would be impossible to put the image content at the corner of the final image
1481
- not_white = tf.logical_not(tf.reduce_all(image > 0.99, -1))
1482
- no_white_ix = tf.where(not_white)
1483
- top_left = tf.reduce_min(no_white_ix, axis=0)
1484
- bottom_right = tf.reduce_max(no_white_ix, axis=0)
1485
- image = tf.image.crop_to_bounding_box(
1486
- image,
1487
- offset_height=tf.cast(top_left[0], tf.int32),
1488
- offset_width=tf.cast(top_left[1], tf.int32),
1489
- target_height=tf.cast(bottom_right[0] - top_left[0] + 1, tf.int32),
1490
- target_width=tf.cast(bottom_right[1] - top_left[1] + 1, tf.int32),
1491
- )
1492
-
1493
- # Translate
1494
- height, width = tf.shape(image)[0], tf.shape(image)[1]
1495
- translation_seed = _rng(0, 1)
1496
- if translation_seed < 0.2:
1497
- h_pad = _rng(0, height//2, (2,), dtype=tf.int32)
1498
- w_pad = _rng(0, width//2, (2,), dtype=tf.int32)
1499
- else:
1500
- h_pad = _rng(0, height*2, (2,), dtype=tf.int32)
1501
- w_pad = _rng(0, width*2, (2,), dtype=tf.int32)
1502
- image = tf.pad(image, [[h_pad[0], w_pad[0]], [h_pad[1], w_pad[1]], [0, 0]],
1503
- constant_values=1)
1504
-
1505
- # Random background color
1506
- # color_rng = tf.random.stateless_uniform((4,), seeds.pop(), 0, 1)
1507
- # random_color = color_rng[:3]
1508
- # valid = tf.reduce_all(tf.reduce_sum(tf.abs(random_color[None, None, :] - image), -1) > 0.03)
1509
- # if color_rng[0] < 0.2 and valid:
1510
- # image = tf.where(tf.reduce_all(image < 0.99, axis=-1, keepdims=True),
1511
- # image, image * 0 + random_color[None, None, :])
1512
-
1513
- # Mild color jitter
1514
- image = tf.image.stateless_random_hue(image, max_delta=0.05, seed=seeds.pop())
1515
- image = tf.image.stateless_random_brightness(image, max_delta=0.15, seed=seeds.pop())
1516
- image = tf.image.stateless_random_saturation(image, 0.8, 1.2, seed=seeds.pop())
1517
- image = tf.image.stateless_random_contrast(image, 0.8, 1.2, seed=seeds.pop())
1518
-
1519
- # ex["metadata/unaugmented_image"] = ex["image"]
1520
- ex["image"] = image
1521
- return ex
1522
-
1523
-
1524
- @seqio.map_over_dataset
1525
- def clocks_preprocessor(ex):
1526
- time_format = ex["time_format"]
1527
- shows_seconds = ex["shows_seconds"]
1528
- hour, minute, second = [tf.cast(ex[k], tf.int32) for k in ["hour", "minute", "second"]]
1529
- if hour == 0: # Midnight of the previous day
1530
- am_pm = "PM"
1531
- hour_str = 12
1532
- hour = 24
1533
- elif hour > 12:
1534
- am_pm = "PM"
1535
- hour_str = hour - 12
1536
- else:
1537
- hour_str = hour
1538
- am_pm = "AM"
1539
- hour_str = tf.strings.as_string(hour_str)
1540
- minute_str = tf.strings.as_string(minute)
1541
- if tf.strings.length(minute_str) == 1:
1542
- minute_str = tf.strings.join(["0", minute_str])
1543
-
1544
- second_str = tf.strings.as_string(second)
1545
- if tf.strings.length(second_str) == 1:
1546
- second_str = tf.strings.join(["0", second_str])
1547
-
1548
- prefix = "The time shown is "
1549
-
1550
- if time_format == "The time is not shown":
1551
- text = "The time is not shown in the image."
1552
- hour, minute, second = -1, -1, -1
1553
- else:
1554
- if not shows_seconds:
1555
- second = -1
1556
- if time_format == "12 hour clock (without AM/PM)" and shows_seconds:
1557
- if hour > 12:
1558
- hour = hour - 12
1559
- time = tf.strings.join([hour_str, ":", minute_str, ":", second_str])
1560
- elif time_format == "12 hour clock (with AM/PM)" and shows_seconds:
1561
- time = tf.strings.join([hour_str, ":", minute_str, ":", second_str, " ", am_pm])
1562
- elif time_format == "12 hour clock (with AM/PM)" and not shows_seconds:
1563
- time = tf.strings.join([hour_str, ":", minute_str, " ", am_pm])
1564
- elif time_format == "12 hour clock (without AM/PM)" and not shows_seconds:
1565
- if hour > 12:
1566
- hour = hour - 12
1567
- time = tf.strings.join([hour_str, ":", minute_str])
1568
- else:
1569
- time = "" # Should never occur, but needed for tf analysis
1570
- tf.debugging.assert_equal(tf.strings.length(time) > 0, True)
1571
- text = tf.strings.join(["The time shown is ", time])
1572
- image = tf.image.decode_image(ex['image'], channels=3, expand_animations=False)
1573
- image = tf.image.convert_image_dtype(image, tf.float32)[:-120] # remove the black shadow at the bottom
1574
- return {
1575
- "image": image,
1576
- "prompt": "What time is being shown?",
1577
- "text": text,
1578
- "metadata/time_format": time_format,
1579
- "metadata/hour": hour,
1580
- "metadata/minute": minute,
1581
- "metadata/text": text,
1582
- "metadata/second": second,
1583
- }
1584
-
1585
-
1586
- @seqio.map_over_dataset()
1587
- def atlas_obscura_preprocessor(ex):
1588
- out = dict(
1589
- image=ex["image"],
1590
- prompt="Where was this picture taken?",
1591
- text=tf.strings.join([
1592
- ex["place"],
1593
- " in ",
1594
- ex["city"]
1595
- ])
1596
- )
1597
- out["metadata/image_url"] = ex["image_url"]
1598
- out["metadata/references"] = out["text"]
1599
- return out
1600
-
1601
-
1602
- @seqio.map_over_dataset()
1603
- def famous_birthdays_preprocessor(ex):
1604
- out = dict(
1605
- image=ex["image"],
1606
- image_url=ex["image_url"],
1607
- prompt="Who is this?",
1608
- text=ex["name"]
1609
- )
1610
- out["metadata/references"] = out["text"]
1611
- return out
1612
-
1613
-
1614
- @seqio.map_over_dataset()
1615
- def mild_color_aug_preprocessor(ex):
1616
- if "image_url" in ex: # URL won't show the augmentations
1617
- del ex["image_url"]
1618
- # ex["metadata/unaugmented_image"] = ex["image"]
1619
- ex["image"] = tf.image.decode_image(ex['image'], channels=3, expand_animations=False)
1620
- ex["image"] = mild_color_aug(ex["image"])
1621
- return ex
1622
-
1623
-
1624
- def build_text_with_points(text, points, img_h, img_w):
1625
- points = points_to_text(img_h, img_w, points[:, 0], points[:, 1])
1626
- parts = tf.strings.split(text, sep="<ANS>")
1627
- with_points = tf.strings.reduce_join(tf.reshape(tf.stack([
1628
- parts,
1629
- tf.pad(points, [[0, 1]], constant_values=""),
1630
- ], 1), [-1]), separator="")
1631
- return tf.strings.split(with_points, "\n\n")
1632
-
1633
-
1634
- @seqio.map_over_dataset()
1635
- def synth_count_preprocessor(example):
1636
- image_shape = tf.shape(example["image"])
1637
- h, w = image_shape[0], image_shape[1]
1638
- questions = build_text_with_points(example["questions"], example["question_points"], h, w)
1639
- answers = build_text_with_points(example["answers"], example["answer_points"], h, w)
1640
- keep_q = tf.strings.regex_full_match(questions, "How many.*")
1641
- keep_ans = tf.strings.regex_full_match(answers, "There are [0-9]+.*")
1642
- keep = tf.logical_and(keep_q, keep_ans)
1643
- questions = tf.boolean_mask(questions, keep)
1644
- answers = tf.boolean_mask(answers, keep)
1645
- ix = tf.range(0, tf.shape(answers)[0], dtype=tf.int32)
1646
- ix = tf.random.shuffle(ix)
1647
- return dict(
1648
- image=example["image"],
1649
- prompt=tf.gather(questions, ix),
1650
- text=tf.gather(answers, ix),
1651
- )
1652
-
1653
-
1654
- def synth_count_inf_preprocessor(ds):
1655
-
1656
- @seqio.map_over_dataset(num_seeds=1)
1657
- def get_two(example, seed):
1658
- image_shape = tf.shape(example["image"])
1659
- h, w = image_shape[0], image_shape[1]
1660
- questions = build_text_with_points(example["questions"], example["question_points"], h, w)
1661
- answers = build_text_with_points(example["answers"], example["answer_points"], h, w)
1662
- keep_q = tf.strings.regex_full_match(questions, "How many.*")
1663
- keep_ans = tf.strings.regex_full_match(answers, "There are [0-9]+.*")
1664
- keep = tf.logical_and(keep_q, keep_ans)
1665
- questions = tf.boolean_mask(questions, keep)
1666
- answers = tf.boolean_mask(answers, keep)
1667
-
1668
- ix = stateless_permutation(tf.shape(answers)[0], seed)[:2]
1669
- return {
1670
- "image": example["image"],
1671
- "prompt": tf.gather(questions, ix),
1672
- "metadata/references": tf.gather(answers, ix),
1673
- }
1674
-
1675
- ds = get_two(ds)
1676
- return flatten_parts(ds, ["prompt", "metadata/references"])
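- # Unlike the training preprocessor above, which keeps every valid "How many ..." / "There are N ..." pair,
- # this inference variant deterministically samples two pairs per image from `seed` and flattens them so
- # each question becomes a separate evaluation example.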
1677
-
1678
-
1679
- def mild_color_aug(image):
1680
- image = tf.image.random_hue(image, max_delta=0.05)
1681
- image = tf.image.random_brightness(image, max_delta=0.15)
1682
- image = tf.image.random_saturation(image, 0.7, 1.3)
1683
- image = tf.image.random_contrast(image, 0.8, 1.2)
1684
- return image
1685
-
1686
-
1687
- @seqio.map_over_dataset()
1688
- def name_entity_augmentation(ex, p_high_color=0.7):
1689
- ex["image"] = tf.image.decode_image(ex['image'], channels=3, expand_animations=False)
1690
- image = ex["image"]
1691
- image = tf.image.convert_image_dtype(image, tf.float32)
1692
-
1693
- # Horizontal flip
1694
- if tf.random.uniform((), 0, 1) > 0.85:
1695
- image = image[:, ::-1]
1696
-
1697
- # Random crop
1698
- height = tf.cast(tf.shape(image)[0], tf.float32)
1699
- width = tf.cast(tf.shape(image)[1], tf.float32)
1700
- crop_rng = tf.random.uniform((), 0, 1)
1701
- if crop_rng < 0.2:
1702
- pass
1703
- else:
1704
- if crop_rng < 0.4:
1705
- h_crop = height * 0.15
1706
- w_crop = width * 0.15
1707
- else:
1708
- h_crop = height * 0.4
1709
- w_crop = width * 0.4
1710
- crop_h = tf.cast(tf.random.uniform((2,), 0, h_crop/2), tf.int32)
1711
- crop_w = tf.cast(tf.random.uniform((2,), 0, w_crop/2), tf.int32)
1712
- image = image[crop_h[0]:-crop_h[1]-1, crop_w[0]:-crop_w[1]-1]
1713
- height = tf.cast(tf.shape(image)[0], tf.float32)
1714
- width = tf.cast(tf.shape(image)[1], tf.float32)
1715
-
1716
- if tf.random.uniform(()) > p_high_color:
1717
- image = tf.image.random_hue(image, max_delta=0.05)
1718
- image = tf.image.random_brightness(image, max_delta=0.15)
1719
- image = tf.image.random_saturation(image, 0.7, 1.3)
1720
- image = tf.image.random_contrast(image, 0.8, 1.2)
1721
- else:
1722
- image = tf.image.random_hue(image, max_delta=0.1)
1723
- image = tf.image.random_brightness(image, max_delta=0.3)
1724
- image = tf.image.random_saturation(image, 0.0, 2.0)
1725
- image = tf.image.random_contrast(image, 0.2, 1.5)
1726
-
1727
- # Apply shear, rotation, and scale through one affine matrix
1728
- sel = tf.random.uniform((), 0, 1)
1729
- if sel < 0.1:
1730
- pass
1731
- else:
1732
- if sel < 0.15: # Scale only
1733
- shear_x = 0.  # use float literals so all branches produce the same dtype
1734
- shear_y = 0.
1735
- rotation = 0.
1736
- elif sel < 0.7: # Mild (elif so the scale-only settings are not overwritten)
1737
- shear_x = tf.random.uniform((), -2, 2)
1738
- shear_y = tf.random.uniform((), -2, 2)
1739
- rotation = tf.random.uniform((), -5, 5)
1740
- else: # Severe
1741
- shear_x = tf.random.uniform((), -10, 10)
1742
- shear_y = tf.random.uniform((), -10, 10)
1743
- rotation = tf.random.uniform((), -20, 20)
1744
-
1745
- max_scale = 1.2
1746
- scale = tf.random.uniform((), 0.4, max_scale)
1747
-
1748
- # Pad so upscaling/rotation will not move the image out of bounds
1749
- pad = tf.cast(tf.maximum(height, width)*0.2, tf.int32)
1750
- image = tf.pad(image, [[pad, pad], [pad, pad], [0, 0]], constant_values=1)
1751
-
1752
- image = tf.keras.ops.image.affine_transform(
1753
- image,
1754
- tf.stack(get_affine_matrix(
1755
- [height/2, width/2],
1756
- rotation,
1757
- [0, 0],
1758
- 1/scale,
1759
- [shear_x, shear_y]
1760
- ) + [0., 0.]),
1761
- interpolation='bilinear',
1762
- fill_mode='constant',
1763
- fill_value=1.,
1764
- data_format='channels_last'
1765
- )
1766
-
1767
- # Crop away the white padding, otherwise the translation below could never place the content at a corner of the canvas
1768
- not_white = tf.logical_not(tf.reduce_all(image > 0.99, -1))
1769
- no_white_ix = tf.where(not_white)
1770
- top_left = tf.reduce_min(no_white_ix, axis=0)
1771
- bottom_right = tf.reduce_max(no_white_ix, axis=0)
1772
-
1773
- # There is a very small chance the image now contains nothing but white space; in that case skip the crop
1774
- if (
1775
- (bottom_right[0] - top_left[0]) > 1 and (bottom_right[1] - top_left[1]) > 1
1776
- ):
1777
- image = tf.image.crop_to_bounding_box(
1778
- image,
1779
- offset_height=tf.cast(top_left[0], tf.int32),
1780
- offset_width=tf.cast(top_left[1], tf.int32),
1781
- target_height=tf.cast(bottom_right[0] - top_left[0] + 1, tf.int32),
1782
- target_width=tf.cast(bottom_right[1] - top_left[1] + 1, tf.int32),
1783
- )
1784
-
1785
- # Translate
1786
- height, width = tf.shape(image)[0], tf.shape(image)[1]
1787
- if tf.random.uniform((), 0, 1) < 0.1:
1788
- h_pad = tf.zeros((2,), dtype=tf.int32)
1789
- w_pad = tf.zeros((2,), dtype=tf.int32)
1790
- elif tf.random.uniform((), 0, 1) < 0.8:
1791
- h_pad = tf.random.uniform((2,), 0, 50, dtype=tf.int32)
1792
- w_pad = tf.random.uniform((2,), 0, 50, dtype=tf.int32)
1793
- else:
1794
- pad = tf.cast(tf.maximum(height, width), tf.int32)
1795
- h_pad = tf.random.uniform((2,), 0, pad, dtype=tf.int32)
1796
- w_pad = tf.random.uniform((2,), 0, pad, dtype=tf.int32)
1797
- image = tf.pad(image, [[h_pad[0], h_pad[1]], [w_pad[0], w_pad[1]], [0, 0]],  # pad height with h_pad and width with w_pad
1798
- constant_values=1)
1799
-
1800
- if "image_url" in ex: # URL won't show the augmentations
1801
- del ex["image_url"]
1802
- # ex["metadata/unaugmented_image"] = ex["image"]
1803
- ex["image"] = image
1804
- return ex
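- # Augmentation order applied above: optional horizontal flip, random crop, colour jitter (mild or strong),
- # a single shear/rotation/scale affine warp on a white-padded canvas, a crop back to the non-white content,
- # and finally a random translation via padding. image_url is dropped because it would no longer match the
- # augmented pixels.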
1805
-
1806
-
1807
- @seqio.map_over_dataset()
1808
- def wiki_art_preprocessor(ex):
1809
- out = dict(
1810
- image=ex["image"],
1811
- prompt="What is this?",
1812
- text=ex["question"]
1813
- )
1814
- out["metadata/title"] = ex["title"]
1815
- out["metadata/gt"] = ex["question"]
1816
- out["metadata/artist"] = ex["artist"]
1817
- out["metadata/painting_url"] = ex["painting_url"]
1818
- # if "metadata/unaugmented_image" in ex:
1819
- # out["metadata/unaugmented_image"] = ex["metadata/unaugmented_image"]
1820
- return out
1821
-
1822
- @seqio.map_over_dataset()
1823
- def oscar_preprocessor(ex):
1824
- out = dict(
1825
- image=ex["image"],
1826
- prompt=ex["question"]
1827
- )
1828
- out.update(_add_metadata(ex))
1829
- out["metadata/question"] = ex["question"]
1830
- out["metadata/answer"] = ex["answer"]
1831
- out["metadata/category"] = ex["category"]
1832
- return out
1833
-
1834
-
1835
- @seqio.map_over_dataset()
1836
- def tulu_preprocessor(ex):
1837
- return {
1838
- "messages": ex["messages"]["content"],
1839
- }
1840
- # logging.info("Debugging tulue")
1841
- # return {"messages": ex["messages"]["content"], "text_weights": 1e-6}
1842
-
1843
-
1844
- WIKI_DATA_QUESTION = "What is this? Respond with just a proper name."
1845
-
1846
-
1847
- @seqio.map_over_dataset()
1848
- def extract_wiki_data(ex):
1849
- return dict(
1850
- image=ex["image"],
1851
- image_url=ex["image_url"],
1852
- prompt=[
1853
- WIKI_DATA_QUESTION,
1854
- "What is this? Respond with the proper name of the main focus of the image and a few details about it."
1855
- ],
1856
- text=[
1857
- tf.strings.strip(tf.strings.regex_replace(ex["question"], r"\(.*\)", "")),
1858
- ex["gptResponse"],
1859
- ]
1860
- )
1861
-
1862
-
1863
- @seqio.map_over_dataset()
1864
- def extract_wiki_data_name(ex):
1865
- target = tf.strings.strip(tf.strings.regex_replace(ex["question"], r"\(.*\)", ""))
1866
- out = dict(
1867
- image=ex["image"],
1868
- image_url=ex["image_url"],
1869
- prompt=WIKI_DATA_QUESTION,
1870
- text=target,
1871
- )
1872
- out["metadata/references"] = target
1873
- return out
1874
-
1875
-
1876
- @seqio.map_over_dataset()
1877
- def extract_wiki_data_describe(ex):
1878
- out = dict(
1879
- image=ex["image"],
1880
- image_url=ex["image_url"],
1881
- prompt="What is this? Respond with the proper name of the main focus of the image and a few details about it.",
1882
- )
1883
- out["metadata/references"] = ex["gptResponse"]
1884
- return out
1885
-
1886
-
1887
- @gin.configurable()
1888
- def format_multiple_style_qa(ds, types=['multiple_choice', 'short_answer'], styles=['ai2_diagram', 'vqa2'], default_style='vqa2',
1889
- strip_instruction=False):
1890
- def _extract(ex):
1891
- prompt = ex["question"]
1892
- out = dict(image=ex["image"])
1893
- out.update(_add_metadata(ex))
1894
-
1895
- out["text"] = ex["answer"]
1896
- out["metadata/references"] = ex["answer"]
1897
-
1898
- if ex["metadata/question_type"] == 'multiple_choice':
1899
- style = styles[0]
1900
- else:
1901
- style = styles[1]
1902
- if strip_instruction:
1903
- if ex["metadata/question_type"] == "multiple_choice":
1904
- # parts = tf.strings.split(prompt, "\n")
1905
- # parts 1 is blank and part -1 is the instruction
1906
- # prompt = tf.strings.reduce_join(tf.concat([parts[:1], parts[2:-1]], 0), separator="\n")
1907
- prompt = prompt
1908
- else:
1909
- prompt = tf.strings.split(prompt, "\n")[0]
1910
-
1911
- out["style"] = style
1912
- out["prompt"] = prompt
1913
- return out
1914
- ds = ds.map(_extract, num_parallel_calls=tf.data.experimental.AUTOTUNE)
1915
- return ds
1916
-
1917
-
1918
- @gin.configurable()
1919
- def extract_mmmu(ds, types=['multiple-choice', 'open'], styles=['ai2_diagram', 'vqa2'], default_style='ai2_diagram', option_format="abc"):
1920
- assert option_format == "abc"
1921
- keys_tensor = tf.constant(types, dtype=tf.string)
1922
- values_tensor = tf.constant(styles, dtype=tf.string)
1923
- table = tf.lookup.StaticHashTable(
1924
- tf.lookup.KeyValueTensorInitializer(keys_tensor, values_tensor),
1925
- default_value=tf.constant(default_style, dtype=tf.string),
1926
- )
1927
- def _extract(ex):
1928
- out = dict(image=tf.expand_dims(ex["image_1"], 0))
1929
- out.update(_add_metadata(ex))
1930
- style = table.lookup(ex["metadata/question_type"])
1931
- out["style"] = style
1932
- out["text"] = ex["answer"]
1933
- out["metadata/references"] = ex["answer"]
1934
-
1935
- if style == styles[0]:
1936
- abc = tf.constant(list("abcdefghi".upper()))
1937
- options = ex["options"]
1938
- num_options = tf.shape(options)[0]
1939
- dummy_options = tf.tile(tf.constant([""], dtype=tf.string), [9 - num_options])
1940
- out["metadata/options"] = tf.concat([options, dummy_options], axis=0)
1941
- out["metadata/options"] = tf.ensure_shape(out["metadata/options"], [9])
1942
-
1943
- short_options = abc[:num_options]
1944
- options = tf.stack([short_options, options,], 1)
1945
- options = tf.strings.reduce_join(options, axis=-1, separator=": ")
1946
- options = tf.strings.reduce_join(options, separator="\n")
1947
- out["prompt"] = tf.strings.join([ex["question"], "\n", options, "\n"])
1948
- if tf.reduce_sum(tf.cast(tf.strings.regex_full_match(options, "<img='(.*?)'>"), tf.int32)) > 1:
1949
- # Following LLaVA, don't use any images if there are multiple image paths
1950
- # The rationale is that in that case the images are the answer options themselves
1951
- out["image"] = out["image"][:0]
1952
- else:
1953
- out["metadata/options"] = tf.constant([""] * 9, dtype=tf.string)
1954
- out["prompt"] = ex["question"]
1955
- out["image"] = out["image"][:0]
1956
- return out
1957
- ds = ds.map(_extract, num_parallel_calls=tf.data.experimental.AUTOTUNE)
1958
- return ds
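- # Multiple-choice prompts built above look roughly like:
- #   <question>
- #   A: <option 1>
- #   B: <option 2>
- #   ...
- # with "metadata/options" padded with empty strings to a fixed length of 9.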
1959
-
1960
- @gin.configurable()
1961
- def extract_mmmu_cot(ds, types=['multiple-choice', 'open'], styles=['ai2_diagram', 'vqa2'], default_style='ai2_diagram', option_format="abc"):
1962
- assert option_format == "abc"
1963
- keys_tensor = tf.constant(types, dtype=tf.string)
1964
- values_tensor = tf.constant(styles, dtype=tf.string)
1965
- table = tf.lookup.StaticHashTable(
1966
- tf.lookup.KeyValueTensorInitializer(keys_tensor, values_tensor),
1967
- default_value=tf.constant(default_style, dtype=tf.string),
1968
- )
1969
- def _extract(ex):
1970
- # out = dict(image=tf.expand_dims(ex["image_with_question"], 0))
1971
- out = dict(image=tf.expand_dims(ex["image_1"], 0))
1972
- out.update(_add_metadata(ex))
1973
- style = table.lookup(ex["metadata/question_type"])
1974
- # out["style"] = style
1975
- out["text"] = ex["answer"]
1976
- out["metadata/question"] = ex["question"]
1977
- out["metadata/references"] = ex["answer"]
1978
-
1979
- if style == styles[0]:
1980
- abc = tf.constant(list("abcdefghi".upper()))
1981
- options = ex["options"]
1982
- num_options = tf.shape(options)[0]
1983
- dummy_options = tf.tile(tf.constant([""], dtype=tf.string), [9 - num_options])
1984
- out["metadata/options"] = tf.concat([options, dummy_options], axis=0)
1985
- out["metadata/options"] = tf.ensure_shape(out["metadata/options"], [9])
1986
-
1987
- short_options = abc[:num_options]
1988
- options = tf.stack([short_options, options,], 1)
1989
- options = tf.strings.reduce_join(options, axis=-1, separator=": ")
1990
- options = tf.strings.reduce_join(options, separator="\n")
1991
- out["prompt"] = tf.strings.join([ex["question"], "\n", options, "\n"])
1992
- # out["prompt"] = ex["question"]
1993
- if tf.reduce_sum(tf.cast(tf.strings.regex_full_match(options, "<img='(.*?)'>"), tf.int32)) > 1:
1994
- # Following LLaVA, don't use any images if there are multiple image paths
1995
- # The rationale is that in that case the images are the answer options themselves
1996
- out["image"] = out["image"][:0]
1997
- else:
1998
- out["metadata/options"] = tf.constant([""] * 9, dtype=tf.string)
1999
- out["prompt"] = ex["question"]
2000
- # out["image"] = out["image"][:0]
2001
- return out
2002
- ds = ds.map(_extract, num_parallel_calls=tf.data.experimental.AUTOTUNE)
2003
- return ds
2004
-
2005
-
2006
- @seqio.map_over_dataset
2007
- def reformat_math_vista(ex):
2008
- query = ex["query"]
2009
- query = tf.strings.split(query, sep="Question:")[-1]
2010
- query = tf.strings.strip(tf.strings.split(query, sep="Hint:")[0])
2011
- ex["query"] = query
2012
- return ex
2013
-
2014
-
2015
- @seqio.map_over_dataset
2016
- def extract_math_vista(ex, styles=['ai2_diagram', 'vqa2']):
2017
- out = dict(image=ex["image"])
2018
- out.update(_add_metadata(ex))
2019
-
2020
- is_mc = ex["metadata/question_type"] == 'multi_choice'
2021
- if is_mc:
2022
- style = styles[0]
2023
- abc = tf.constant(list("abcdefghi".upper()))
2024
- options = ex["choices"]
2025
- num_options = tf.shape(options)[0]
2026
- dummy_options = tf.tile(tf.constant([""], dtype=tf.string), [9 - num_options])
2027
- out["metadata/options"] = tf.concat([options, dummy_options], axis=0)
2028
- out["metadata/options"] = tf.ensure_shape(out["metadata/options"], [9])
2029
-
2030
- if ex["metadata/split"] != "test":
2031
- short_options = abc[:num_options]
2032
- answer_short_option = tf.boolean_mask(short_options, options == ex["answer"])[0]
2033
- out["text"] = answer_short_option
2034
- else:
2035
- out["text"] = ex["answer"]
2036
- else:
2037
- style = styles[1]
2038
- out["metadata/options"] = tf.constant([""] * 9, dtype=tf.string)
2039
- out["text"] = ex["answer"]
2040
- out["style"] = style
2041
- out["prompt"] = ex["query"]
2042
- out["metadata/query"] = ex["query"]
2043
- out["metadata/references"] = ex["answer"]
2044
- return out
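- # For multiple-choice questions outside the test split, the raw answer string is mapped to its option letter
- # (A-I) so the target matches the ai2_diagram style; test-split and free-form answers are kept verbatim.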
2045
-
2046
-
2047
- NO_POINT_PREFIX = [
2048
- "No pointing: ",
2049
- "No pointing: ",
2050
- "no pointing:\n",
2051
- "No pointing:\n",
2052
- "Not pointing:\n",
2053
- "No Points: ",
2054
- "No Points: ",
2055
- "NO POINTING\n",
2056
- "No pontiing\n",
2057
- "No Points:\n ",
2058
- "No pointing\n",
2059
- "Do not point. ",
2060
- "Refrain from pointing. ",
2061
- "Avoid generating points . ",
2062
- "For this question, do not use points. ",
2063
- "Refrain from using points:\n",
2064
- "Don't include points in your response. ",
2065
- "Don't point. ",
2066
- "Don't use points. ",
2067
- "Please don't use points.\n\n",
2068
- "Please don't use points.\n\n",
2069
- "Respond without using points. ",
2070
- "Respond without pointing:\n",
2071
- "Do not generate ponits: ",
2072
- "Do not point. ",
2073
- "Do not point\n",
2074
- "no pointing\n\n",
2075
- "Answer without points: ",
2076
- "Answer this question without pointing: ",
2077
- "Answer without poiints. ",
2078
- "answer without points: ",
2079
- "answer with text only, do not points\n"
2080
- ]
2081
- assert all(x[-1].isspace() for x in NO_POINT_PREFIX)
2082
- NO_POINT_PREFIX_TF = tf.constant(NO_POINT_PREFIX)
2083
-
2084
-
2085
- def prefix_how_many(messages, seed):
2086
- question = messages[0]
2087
- if tf.strings.regex_full_match(tf.strings.lower(question), "how many.*"):
2088
- ix = tf.random.stateless_uniform((), seed, 0, len(NO_POINT_PREFIX), tf.int32)
2089
- question = tf.strings.join([NO_POINT_PREFIX_TF[ix], question])
2090
- return tf.concat([tf.expand_dims(question, 0), messages[1:]], axis=0)
2091
- else:
2092
- return messages
2093
-
2094
-
2095
- @seqio.map_over_dataset(num_seeds=1)
2096
- def prefix_how_many_messages(ex, seed):
2097
- messages = ex["messages"]
2098
- n = tf.shape(messages)[0]
2099
- seeds = tf.random.split(seed, n)
2100
- message_arr = tf.TensorArray(dtype=tf.string, size=n, element_shape=(None,))
2101
- for i in range(n):
2102
- message_arr = message_arr.write(i, prefix_how_many(messages[i], seeds[i]))
2103
- ex["messages"] = tf.RaggedTensor.from_row_splits(
2104
- values=message_arr.concat(), row_splits=messages.row_splits)
2105
- return ex
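- # Sketch: first-turn questions matching "How many ..." are counting questions, so they get a randomly chosen
- # NO_POINT_PREFIX instruction to discourage point annotations, e.g.
- #   "How many dogs are in the image?" -> "No pointing: How many dogs are in the image?"
- # All other turns pass through unchanged.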
2106
-
2107
-
2108
- def filter_single_turn(ds):
2109
- @seqio.map_over_dataset
2110
- def _filter(ex):
2111
- multi_turn = ex["messages"].row_lengths() > 2
2112
- ex["messages"] = tf.ragged.boolean_mask(ex["messages"], multi_turn)
2113
- return ex
2114
-
2115
- ds = _filter(ds)
2116
- ds = ds.filter(lambda x: tf.shape(x["messages"])[0] > 0)
2117
- return ds
2118
-
2119
-
2120
- @seqio.map_over_dataset(num_seeds=1)
2121
- def extract_cockatoo_qa_v2(ex, seed):
2122
- messages = tf.RaggedTensor.from_value_rowids(ex["messages"], ex["conversation_ids"])
2123
- ix = stateless_permutation(tf.shape(messages)[0], seed)
2124
- messages = tf.gather(messages, ix)
2125
- out = dict(
2126
- image=ex["image"],
2127
- messages=messages
2128
- )
2129
- out.update(_add_metadata(ex))
2130
- return out
2131
-
2132
-
2133
- def format_mmbench(ds):
2134
-
2135
- def _trim(ex):
2136
- num_passes = tf.shape(ex["id"])[0]
2137
- ex["choices"] = ex["choices"][:num_passes, :num_passes]
2138
- ex["answer"] = ex["answer"][:num_passes]
2139
- return ex
2140
-
2141
- ds = ds.map(_trim)
2142
- ds = flatten_parts(ds, ["id", "query", "choices", "answer"])
2143
-
2144
- def _extract(ex):
2145
- out = dict(image=ex["image"])
2146
- out.update(_add_metadata(ex))
2147
- out["prompt"] = ex["query"]
2148
- out["text"] = ex["answer"]
2149
- options = ex["choices"]
2150
- tf.debugging.assert_equal(tf.reduce_any(tf.strings.regex_full_match(options, r".*\|\|\|.*")), False)  # the "|||" separator must never appear inside an option
2151
- out["metadata/options"] = tf.strings.reduce_join(options, separator="|||")
2152
- out["metadata/question"] = ex["question"]
2153
- out["metadata/references"] = ex["answer"]
2154
- return out
2155
-
2156
- ds = ds.map(_extract, num_parallel_calls=tf.data.experimental.AUTOTUNE)
2157
- return ds
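- # The candidate options are serialised into one string joined with "|||" so they fit the flat metadata
- # format; the assert above guards against an option containing the separator itself. Downstream evaluation
- # code is assumed to split "metadata/options" back on "|||".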
2158
-
2159
-
2160
- @seqio.map_over_dataset
2161
- def extract_lvis(ex, class_name_file="gs://oe-training-chrisc/cockatoo/data/lvis_class_names.json"):
2162
- with tf.io.gfile.GFile(class_name_file) as f:
2163
- class_names = json.load(f)
2164
- class_names_arr = [None]*len(class_names)
2165
- for k, v in class_names.items():
2166
- class_names_arr[int(k)] = v
2167
- assert all(x is not None for x in class_names_arr)
2168
- class_names_arr = tf.constant(class_names_arr)
2169
-
2170
- return dict(
2171
- image=ex["image"],
2172
- bbox=ex["objects"]["bbox"],
2173
- label=tf.gather(class_names_arr, ex["objects"]["label"]),
2174
- )
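- # The class-name JSON is read from GCS with plain Python, so it is loaded once when this map function is
- # traced rather than per example; labels are then gathered from the resulting constant string tensor.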
2175
-
2176
-
2177
- def extract_open_images_boxes(ds):
2178
- # ds = ds.filter(lambda ex: tf.logical_or(
2179
- # tf.shape(ex["cap/cap_caption"])[0] > 0,
2180
- # tf.shape(ex["detection/bbox"])[0] > 0
2181
- # ))
2182
- ds = ds.filter(lambda ex: tf.shape(ex["cap/cap_caption"])[0] > 0)
2183
-
2184
- @seqio.map_over_dataset
2185
- def _map(ex):
2186
- bbox = tf.reshape(ex["detection/bbox"], (-1, 4))
2187
- bbox = tf.stack([
2188
- bbox[:, 2],
2189
- bbox[:, 0],
2190
- bbox[:, 3],
2191
- bbox[:, 1]
2192
- ], 1)
2193
- return dict(
2194
- image=tf.image.decode_jpeg(ex["image"]),
2195
- bbox=bbox,
2196
- label=ex["detection/label"],
2197
- caption=tf.strings.reduce_join(ex["cap/cap_caption"], separator="\n")
2198
- )
2199
-
2200
- return _map(ds)
2201
-
2202
-
2203
- @seqio.map_over_dataset
2204
- def region_captions_to_dense(ex):
2205
- if "captions" in ex:
2206
- captions = ex["captions"]["text"]
2207
- boxes = ex["captions"]["bbox"]
2208
- else:
2209
- captions = ex["label"]
2210
- boxes = ex["bbox"]
2211
-
2212
-
2213
- sh = tf.cast(tf.shape(ex["image"])[:2], tf.float32)
2214
- # image_h, image_w = sh[0], sh[1]
2215
- w = boxes[:, 2] - boxes[:, 0]
2216
- h = boxes[:, 3] - boxes[:, 1]
2217
-
2218
- cx = tf.cast(boxes[:, 0] + w/2, tf.float32)
2219
- cy = tf.cast(boxes[:, 1] + h/2, tf.float32)
2220
- # w = w / image_w
2221
- # h = h / image_h
2222
- coor = tf.strings.reduce_join(
2223
- float_to_text(tf.stack([cx, cy, w, h], 1)), separator=",", axis=1)
2224
-
2225
- area = w*h
2226
- if tf.random.uniform(()) < 0.5:
2227
- coor_text = "before"
2228
- captions = tf.strings.join([coor, captions], separator=": ")
2229
- else:
2230
- coor_text = "after"
2231
- captions = tf.strings.join([captions, coor], separator=": ")
2232
-
2233
- ix = tf.random.uniform((), 0, 6, tf.int32)
2234
- center = boxes
2235
- if ix == 0:
2236
- order_text = "left"
2237
- sort_by = boxes[:, 0]
2238
- elif ix == 1:
2239
- order_text = "right"
2240
- sort_by = -boxes[:, 2]
2241
- elif ix == 2:
2242
- order_text = "top"
2243
- sort_by = boxes[:, 1]
2244
- elif ix == 3:
2245
- order_text = "bottom"
2246
- sort_by = -boxes[:, 3]
2247
- elif ix == 4:
2248
- order_text = "largest"
2249
- sort_by = area
2250
- else:
2251
- order_text = "smallest"
2252
- sort_by = -area
2253
- ixs = tf.argsort(sort_by)
2254
- captions = tf.gather(captions, ixs)
2255
- text = tf.strings.join([
2256
- order_text,
2257
- coor_text,
2258
- tf.strings.reduce_join(captions, separator="\n")
2259
- ], separator="; ")
2260
-
2261
- if "caption" in ex:
2262
- if tf.random.uniform(()) > 0.5:
2263
- text = tf.strings.join([text, "\ncaption: ", ex["caption"]])
2264
- else:
2265
- text = tf.strings.join(["caption: ", ex["caption"], "\n", text])
2266
-
2267
- return dict(
2268
- image=ex["image"],
2269
- text=text
2270
- )
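- # Rough shape of the output (the exact number formatting comes from float_to_text, defined earlier in this
- # file): every region becomes "cx,cy,w,h: caption" or "caption: cx,cy,w,h", the regions are sorted by a
- # randomly chosen criterion, and the target looks roughly like
- #   "largest; before; 12,34,56,78: a red car\n90,12,34,56: a tree"
- # with the global caption optionally prepended or appended.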
2271
-
2272
-
2273
- @seqio.map_over_dataset()
2274
- def join_captions(ex):
2275
- text = tf.random.shuffle(ex['text'])
2276
- ex["text"] = tf.strings.reduce_join(text, separator="\n")
2277
- return ex
2278
-
2279
-
2280
- @seqio.map_over_dataset(num_seeds=1)
2281
- def extract_figureqa(ex, seed):
2282
- questions = ex["questions"]
2283
- n = stateless_permutation(tf.shape(questions["question"])[0], seed)
2284
- return dict(
2285
- image=ex["image"],
2286
- questions=tf.gather(questions["question"], n),
2287
- question_id=tf.gather(questions["question_id"], n),
2288
- answer=tf.gather(tf.strings.as_string(questions["answer"]), n)
2289
- )
2290
-
2291
-
2292
- @seqio.map_over_dataset
2293
- def convert_figureqa_answer(ex):
2294
- keys_tensor = tf.constant(["0", "1"])
2295
- values_tensor = tf.constant(["no", "yes"])
2296
- table = tf.lookup.StaticHashTable(
2297
- tf.lookup.KeyValueTensorInitializer(keys_tensor, values_tensor),
2298
- default_value=tf.constant("nan", dtype=tf.string),
2299
- )
2300
- answer = table.lookup(ex["answer"])
2301
- ex["answer"] = answer
2302
- return ex
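- # FigureQA stores binary answers, so the lookup table maps "0" -> "no" and "1" -> "yes", with "nan" as the
- # fallback for anything unexpected.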
2303
-
2304
-
2305
- @seqio.map_over_dataset()
2306
- def build_question_with_hint(ex):
2307
- hint = ex["hint"]
2308
- if tf.strings.length(hint) > 0:
2309
- ex["question"] = tf.strings.join([hint, ex["question"]], separator="\n")
2310
- return ex
2311
-
2312
- @seqio.map_over_dataset()
2313
- def build_question_with_context(ex):
2314
- context = ex["context"]
2315
- if tf.strings.length(context) > 0:
2316
- ex["question"] = tf.strings.join([context, ex["question"]], separator="\n")
2317
- return ex
2318
-
2319
-
2320
- def max_words(ds, max_words):
2321
- return ds.filter(lambda x: x["n_words"] <= max_words)
2322
-
2323
-
2324
- @seqio.map_over_dataset
2325
- def format_pdfa_eng_wds(example):
2326
- return dict(
2327
- image=example["image"],
2328
- text=tf.strings.reduce_join(example["lines"]["text"], separator="\n"),
2329
- )
2330
-
2331
-
2332
- @gin.configurable()
2333
- def accuracy_conditioned_joint(ds, sequence_length, is_eval=False, eval_quality=17,
2334
- transcript_quality=None):
2335
- # v2: Transcripts no longer get a quality score
2336
- is_training = sequence_length.get('is_training', True)
2337
- if not is_training:
2338
- if is_eval:
2339
- prompt = f"quality {eval_quality}:"
2340
- else:
2341
- prompt = f"quality 17:"
2342
-
2343
- @seqio.map_over_dataset
2344
- def _with_prompt(ex):
2345
- out = dict(
2346
- image=ex["image"],
2347
- url=ex["url"],
2348
- prompt=prompt,
2349
- )
2350
- if "text" in ex:
2351
- out["text"] = ex["text"]
2352
- elif "caption" in ex:
2353
- out["text"] = ex["caption"]
2354
- return out
2355
- return _with_prompt(ds)
2356
-
2357
- elif is_eval:
2358
- raise ValueError("is_eval=True requires is_training=False")
2359
-
2360
- # Training path: build caption, transcript, and edited-caption targets for each example
2361
- @seqio.map_over_dataset
2362
- def _with_transcript(ex):
2363
- if tf.shape(ex["edited_captions"]["caption"])[0] > 0:
2364
- edited_caption = ex["edited_captions"]["caption"][0]
2365
- n = ex["edited_captions"]["n_edits"][0]
2366
- else:
2367
- edited_caption = ""
2368
- n = 0
2369
- text = [
2370
- ex["caption"],
2371
- ex["transcripts"][tf.random.uniform((), 0, tf.shape(ex["transcripts"])[0], dtype=tf.int32)],
2372
- edited_caption
2373
- ]
2374
- edit_quality = 17 - n
2375
- prompt = [
2376
- "quality 17:",
2377
- "" if transcript_quality is None else f"quality: {edit_quality}:",
2378
- tf.strings.join(["quality ", tf.strings.as_string(edit_quality), ":"])
2379
- ]
2380
- return dict(
2381
- image=ex["image"],
2382
- text=tf.stack(text, 0),
2383
- url=ex["url"],
2384
- prompt=tf.stack(prompt, 0),
2385
- style=["long_caption", "transcript", "long_caption"]
2386
- )
2387
- return _with_transcript(ds)
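- # At training time each image yields three targets: the full caption (prompt "quality 17:"), one randomly
- # chosen transcript (unscored unless transcript_quality is set), and the top edited caption, whose quality
- # score is 17 minus its number of edits. At inference time a single fixed "quality N:" prompt is used.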
2388
-
2389
-
2390
- def select_dense_caption_sample(ds, samples=200):
2391
- def compute_hash(string: str) -> str:
2392
- return hashlib.sha256(string.encode("utf-8")).hexdigest()
2393
-
2394
- with tf.io.gfile.GFile("gs://oe-training-chrisc/cockatoo/data/dense-caption-eval-v0-final-data.json") as f:
2395
- data = json.load(f)
2396
- for ex in data:
2397
- ex["image_id"] = compute_hash(ex["image"])
2398
- data.sort(key=lambda x: x["image_id"])
2399
- np.random.RandomState(12312).shuffle(data)
2400
- keep = tf.constant([x["image"] for x in data[:samples]])
2401
-
2402
- def _keep(ex):
2403
- return tf.reduce_any(ex["url"] == keep)
2404
- ds = ds.filter(_keep)
2405
- ds = tf.data.experimental.assert_cardinality(samples)(ds)
2406
- return ds
2407
-
2408
- @seqio.map_over_dataset()
2409
- def charxiv_preprocessor(ex):
2410
- question_names = ["descriptive_q1", "descriptive_q2", "descriptive_q3", "descriptive_q4", "reasoning_q"]
2411
- answer_names = ["descriptive_a1", "descriptive_a2", "descriptive_a3", "descriptive_a4", "reasoning_a"]
2412
-
2413
- questions = [ex[name] for name in question_names]
2414
- answers = [ex[name] for name in answer_names]
2415
-
2416
- return dict(
2417
- image=ex["image"],
2418
- question=tf.stack(questions, 0),
2419
- answer=tf.stack(answers, 0)
2420
- )
2421
-
2422
- @seqio.map_over_dataset()
2423
- def charxiv_descriptive_preprocessor(ex):
2424
- question_names = ["descriptive_q1", "descriptive_q2", "descriptive_q3", "descriptive_q4"]
2425
- answer_names = ["descriptive_a1", "descriptive_a2", "descriptive_a3", "descriptive_a4"]
2426
-
2427
- questions = [ex[name] for name in question_names]
2428
- answers = [ex[name] for name in answer_names]
2429
-
2430
- return dict(
2431
- image=ex["image"],
2432
- question=tf.stack(questions, 0),
2433
- answer=tf.stack(answers, 0)
2434
- )
2435
-
2436
- @seqio.map_over_dataset()
2437
- def charxiv_reasoning_preprocessor(ex):
2438
- return dict(
2439
- image=ex["image"],
2440
- question=ex["reasoning_q"],
2441
- answer=ex["reasoning_a"]
2442
- )
2443
-
2444
- @seqio.map_over_dataset()
2445
- def tablevqa_preprocessor(ex):
2446
- return dict(
2447
- image=ex["image"],
2448
- question=ex["question"],
2449
- answer=ex["gt"]
2450
- )
2451
-
2452
- @seqio.map_over_dataset()
2453
- def vtabfact_preprocessor(ex):
2454
- return dict(
2455
- image=ex["image"],
2456
- question=tf.strings.join([ex["question"], "Answer with yes or no."], separator="\n"),
2457
- answer=ex["gt"]
2458
- )
2459
-
2460
- @seqio.map_over_dataset()
2461
- def nutrition_fact_preprocessor(ex):
2462
- question_names = ["descriptive_q", "reasoning_q"]
2463
- answer_names = ["descriptive_a", "reasoning_a"]
2464
-
2465
- questions = [ex[name] for name in question_names]
2466
- answers = [ex[name] for name in answer_names]
2467
-
2468
- return dict(
2469
- image=ex["image"],
2470
- question=tf.stack(questions, 0),
2471
- answer=tf.stack(answers, 0)
2472
- )