"""Gradio demo that visualizes per-token surprisal of Japanese text.

Two GPT-2 models are compared: a locally fine-tuned model (./model) and the
rinna/japanese-gpt2-medium baseline. High-surprisal (hard-to-read) spans are
highlighted red, low-surprisal (easy-to-read) spans blue.
"""
from pathlib import Path
from typing import List, Optional, Tuple

import gradio as gr
import numpy as np
import torch
from sudachipy import dictionary
from sudachipy import tokenizer as sudachi_tokenizer
from transformers import AutoModelForCausalLM, PreTrainedTokenizer, T5Tokenizer

model_dir = Path(__file__).parents[0] / "model"
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

tokenizer = T5Tokenizer.from_pretrained(model_dir)
tokenizer.do_lower_case = True

# fine-tuned model
trained_model = AutoModelForCausalLM.from_pretrained(model_dir)
trained_model.to(device)

# baseline model
baseline_model = AutoModelForCausalLM.from_pretrained("rinna/japanese-gpt2-medium")
baseline_model.to(device)

# Sudachi morphological analyzer; SplitMode.C yields the longest word units.
sudachi_tokenizer_obj = dictionary.Dictionary().create()
mode = sudachi_tokenizer.Tokenizer.SplitMode.C


def sudachi_tokenize(input_text: str) -> List[str]:
    morphemes = sudachi_tokenizer_obj.tokenize(input_text, mode)
    return [morpheme.surface() for morpheme in morphemes]


def calc_offsets(tokens: List[str]) -> List[int]:
    """Cumulative character offsets of the token boundaries."""
    offsets = [0]
    for token in tokens:
        offsets.append(offsets[-1] + len(token))
    return offsets


def distribute_surprisals_to_characters(
    tokens2surprisal: List[Tuple[str, float]]
) -> List[Tuple[str, float]]:
    """Spread each token's surprisal uniformly over its characters."""
    tokens2surprisal_by_character: List[Tuple[str, float]] = []
    for token, surprisal in tokens2surprisal:
        token_len = len(token)
        for character in token:
            tokens2surprisal_by_character.append((character, surprisal / token_len))
    return tokens2surprisal_by_character


def calculate_surprisals_by_character(
    input_text: str, model: AutoModelForCausalLM, tokenizer: PreTrainedTokenizer
) -> Tuple[float, List[Tuple[str, float]]]:
    input_tokens = [
        token.replace("▁", "")
        for token in tokenizer.tokenize(input_text)
        if token != "▁"
    ]
    # Prepend the BOS token <s> so the first real token's surprisal is defined.
    input_ids = tokenizer.encode(
        "<s>" + input_text, add_special_tokens=False, return_tensors="pt"
    ).to(device)

    with torch.no_grad():  # inference only, no gradients needed
        logits = model(input_ids)["logits"].squeeze(0)

    surprisals = []
    for i in range(logits.shape[0] - 1):
        # id 9 is the bare "▁" (whitespace) token, which was also filtered out
        # of input_tokens above; skip it to keep the two lists aligned.
        if input_ids[0][i + 1] == 9:
            continue
        logit = logits[i]
        prob = torch.softmax(logit, dim=0)
        neg_logprob = -torch.log(prob)
        surprisals.append(neg_logprob[input_ids[0][i + 1]].item())
    mean_surprisal = np.mean(surprisals)

    tokens2surprisal: List[Tuple[str, float]] = []
    for token, surprisal in zip(input_tokens, surprisals):
        tokens2surprisal.append((token, surprisal))

    char2surprisal = distribute_surprisals_to_characters(tokens2surprisal)

    return mean_surprisal, char2surprisal


def aggregate_surprisals_by_offset(
    char2surprisal: List[Tuple[str, float]], offsets: List[int]
) -> List[Tuple[str, float]]:
    """Re-aggregate character-level surprisals into Sudachi word units."""
    tokens2surprisal = []
    for i in range(len(offsets) - 1):
        start = offsets[i]
        end = offsets[i + 1]
        surprisal = sum([surprisal for _, surprisal in char2surprisal[start:end]])
        token = "".join([char for char, _ in char2surprisal[start:end]])
        tokens2surprisal.append((token, surprisal))

    return tokens2surprisal


def highlight_token(token: str, score: float) -> str:
    # Clamp so the colour components stay in the valid 0-255 range (diff
    # scores from calculate_surprisal_diff can fall outside [-1, 1]).
    score = max(-1.0, min(1.0, score))
    if score > 0:
        # positive score: shade towards red
        html_color = "#%02X%02X%02X" % (
            255,
            int(255 * (1 - score)),
            int(255 * (1 - score)),
        )
    else:
        # non-positive score: shade towards blue
        html_color = "#%02X%02X%02X" % (
            int(255 * (1 + score)),
            int(255 * (1 + score)),
            255,
        )
    return '<span style="background-color: {}">{}</span>'.format(html_color, token)


def create_highlighted_text(
    label: str,
    tokens2scores: List[Tuple[str, float]],
    mean_surprisal: Optional[float] = None,
) -> str:
    if mean_surprisal is None:
        highlighted_text = "<h2><b>" + label + "</b></h2>"
    else:
        highlighted_text = (
            "<h2><b>" + label + f"(サプライザル平均値: {mean_surprisal:.3f})</b></h2>"
        )
    for token, score in tokens2scores:
        highlighted_text += highlight_token(token, score)
    return highlighted_text


def normalize_surprisals(
    tokens2surprisal: List[Tuple[str, float]], log_scale: bool = False
) -> List[Tuple[str, float]]:
    """Min-max normalize surprisals to the range [0, 1]."""
    if log_scale:
        surprisals = [np.log(surprisal) for _, surprisal in tokens2surprisal]
    else:
        surprisals = [surprisal for _, surprisal in tokens2surprisal]
    min_surprisal = np.min(surprisals)
    max_surprisal = np.max(surprisals)
    surprisals = [
        (surprisal - min_surprisal) / (max_surprisal - min_surprisal)
        for surprisal in surprisals
    ]
    assert min(surprisals) >= 0
    assert max(surprisals) <= 1
    return [
        (token, surprisal)
        for (token, _), surprisal in zip(tokens2surprisal, surprisals)
    ]


def calculate_surprisal_diff(
    tokens2surprisal: List[Tuple[str, float]],
    baseline_tokens2surprisal: List[Tuple[str, float]],
    scale: float = 100.0,
) -> List[Tuple[str, float]]:
    diff_tokens2surprisal = [
        (token, (surprisal - baseline_surprisal) * scale)
        for (token, surprisal), (_, baseline_surprisal) in zip(
            tokens2surprisal, baseline_tokens2surprisal
        )
    ]
    return diff_tokens2surprisal


def main(input_text: str) -> Tuple[str, str, str]:
    # Labels: 学習後モデル = "model after training", 学習前モデル = "model
    # before training (baseline)", 学習前後の差分 = "difference before/after".
    mean_surprisal, char2surprisal = calculate_surprisals_by_character(
        input_text, trained_model, tokenizer
    )
    offsets = calc_offsets(sudachi_tokenize(input_text))
    tokens2surprisal = aggregate_surprisals_by_offset(char2surprisal, offsets)
    tokens2surprisal = normalize_surprisals(tokens2surprisal)
    highlighted_text = create_highlighted_text(
        "学習後モデル", tokens2surprisal, mean_surprisal
    )

    (
        baseline_mean_surprisal,
        baseline_char2surprisal,
    ) = calculate_surprisals_by_character(input_text, baseline_model, tokenizer)
    baseline_tokens2surprisal = aggregate_surprisals_by_offset(
        baseline_char2surprisal, offsets
    )
    baseline_tokens2surprisal = normalize_surprisals(baseline_tokens2surprisal)
    baseline_highlighted_text = create_highlighted_text(
        "学習前モデル", baseline_tokens2surprisal, baseline_mean_surprisal
    )

    diff_tokens2surprisal = calculate_surprisal_diff(
        tokens2surprisal, baseline_tokens2surprisal, 100.0
    )
    diff_highlighted_text = create_highlighted_text(
        "学習前後の差分", diff_tokens2surprisal, None
    )

    return (
        baseline_highlighted_text,
        highlighted_text,
        diff_highlighted_text,
    )


if __name__ == "__main__":
    demo = gr.Interface(
        fn=main,
        # "AI that automatically rates how easy a text is to read"
        title="文章の読みやすさを自動評価するAI",
        # "Enter a text; hard-to-read spans come back highlighted in red,
        # easy-to-read spans in blue."
        description="文章を入力すると、読みづらい表現は赤く、読みやすい表現は青くハイライトされて出力されます。",
        show_label=True,
        inputs=gr.Textbox(
            lines=5,
            label="文章",  # "text"
            placeholder="ここに文章を入力してください。",  # "enter a text here"
        ),
        outputs=[
            gr.HTML(label="学習前モデル", show_label=True),
            gr.HTML(label="学習後モデル", show_label=True),
            gr.HTML(label="学習前後の差分", show_label=True),
        ],
        examples=[
            "太郎が二郎を殴った。",
            "太郎が二郎に殴った。",
            "サイエンスインパクトラボは、国立研究開発法人科学技術振興機構(JST)の「科学と社会」推進部が行う共創プログラムです。「先端の研究開発を行う研究者」と「社会課題解決に取り組むプレイヤー」が約3ヶ月に渡って共創活動を行います。",
            "近年、ニューラル言語モデルが自然言語の統語知識をどれほど有しているかを、容認性判断課題を通して検証する研究が行われてきている。しかし、このような言語モデルの統語的評価を行うためのデータセットは、主に英語を中心とした欧米の諸言語を対象に構築されてきた。本研究では、既存のデータセットの問題点を克服しつつ、このようなデータセットが構築されてこなかった日本語を対象とした初めてのデータセットである JCoLA (Japanese Corpus of Linguistic Acceptability) を構築した上で、それを用いた言語モデルの統語的評価を行った。",
        ],
    )
    demo.launch()