japanese-clip-vit-h-14-bert-base / tokenization_custom_clip.py

Upload processor

5af068e verified 8 months ago

2.66 kB

	# coding=utf-8

	# Modified from rinna
	# https://github.com/rinnakk/japanese-clip/blob/master/src/japanese_clip/tokenizer.py

	# ################################## COPIED ##################################
	# Copyright 2022 rinna Co., Ltd.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	# ################################## COPIED ##################################

	from typing import Union

	import torch
	from transformers import AutoTokenizer, T5Tokenizer


	class CustomCLIPTokenizer(T5Tokenizer):
	    """T5-based tokenizer that produces fixed-length, CLS-prefixed tensors.

	    Calling an instance returns a plain ``dict`` of ``torch.long`` tensors
	    keyed by the names listed in ``model_input_names``.
	    """

	    model_input_names = ["input_ids", "attention_mask", "position_ids"]

	    def __init__(self, *args, **kwargs):
	        """Build the underlying ``T5Tokenizer`` and force lower-casing.

	        All positional and keyword arguments are forwarded unchanged to
	        ``T5Tokenizer.__init__``, so the tokenizer can be constructed with
	        keyword arguments (e.g. ``vocab_file=...``) as well as positionally.
	        """
	        super().__init__(*args, **kwargs)
	        # Set explicitly after construction: the value is not picked up
	        # reliably from the tokenizer config (bug noted by the original
	        # author).
	        self.do_lower_case = True

	    def __call__(
	        self,
	        texts: Union[str, list[str]],
	        tokenizer: Union[T5Tokenizer, None] = None,
	        max_seq_len: int = 77,
	        device: Union[str, torch.device] = (
	            "cuda" if torch.cuda.is_available() else "cpu"
	        ),
	        **kwargs,
	    ):
	        """Tokenize ``texts`` into CLS-prefixed tensors of fixed length.

	        Args:
	            texts: A single string or a list of strings.
	            tokenizer: Optional tokenizer used instead of ``self``. It is
	                called directly and must expose ``cls_token_id``.
	            max_seq_len: Target sequence length including the leading CLS
	                token; the text itself is padded/truncated to
	                ``max_seq_len - 1`` tokens.
	            device: Device the returned tensors are moved to. The default
	                is evaluated once, when the class body is executed.
	            **kwargs: Accepted for call compatibility and ignored.

	        Returns:
	            A dict with ``"input_ids"``, ``"attention_mask"`` and
	            ``"position_ids"``, each a ``torch.long`` tensor on ``device``.
	        """
	        if isinstance(texts, str):
	            texts = [texts]

	        if tokenizer is None:
	            tokenizer = self
	            # Use the parent implementation; calling ``self(...)`` here
	            # would recurse into this method.
	            tokenizer_call = super().__call__
	        else:
	            tokenizer_call = tokenizer

	        # Reserve one slot for the CLS token that is prepended below.
	        inputs = tokenizer_call(
	            texts,
	            max_length=max_seq_len - 1,
	            padding="max_length",
	            truncation=True,
	            add_special_tokens=False,
	        )

	        # Add the CLS token in first place and mark it as attended.
	        input_ids = [[tokenizer.cls_token_id] + ids for ids in inputs["input_ids"]]
	        attention_mask = [[1] + am for am in inputs["attention_mask"]]
	        # Every row gets the same 0..seq_len-1 positions.
	        position_ids = [list(range(0, len(input_ids[0])))] * len(texts)

	        input_ids = torch.tensor(input_ids, dtype=torch.long)
	        attention_mask = torch.tensor(attention_mask, dtype=torch.long)
	        position_ids = torch.tensor(position_ids, dtype=torch.long)
	        return {
	            "input_ids": input_ids.to(device),
	            "attention_mask": attention_mask.to(device),
	            "position_ids": position_ids.to(device),
	        }


	# Import-time side effect: register CustomCLIPTokenizer with AutoTokenizer
	# under the key "CustomCLIPTokenizer".
	# NOTE(review): AutoTokenizer.register documents its first argument as a
	# config class; a plain string is passed here -- confirm this resolves as
	# intended with the transformers version in use.
	AutoTokenizer.register("CustomCLIPTokenizer", CustomCLIPTokenizer)