# koCSN_SAPR/utils/ner_utils.py
"""
NER λͺ¨λΈμ„ μ΄μš©ν•˜μ—¬ μž‘μ—…ν•˜λŠ” μ½”λ“œμž…λ‹ˆλ‹€.
"""
import re
import torch
import numpy as np
from collections import Counter
device = "cuda:0" if torch.cuda.is_available() else "cpu"
def ner_tokenizer(text, max_seq_length, checkpoint):
"""
NER을 μœ„ν•΄ ν…μŠ€νŠΈλ₯Ό ν† ν°ν™”ν•©λ‹ˆλ‹€.
Args:
sent: μ²˜λ¦¬ν•˜κ³ μž ν•˜λŠ” ν…μŠ€νŠΈλ₯Ό μž…λ ₯λ°›μŠ΅λ‹ˆλ‹€.
max_seq_length: BERT의 configμ—μ„œ 처리 κ°€λŠ₯ν•œ μ΅œλŒ€ λ¬Έμžμ—΄ κΈΈμ΄λŠ” 512μž…λ‹ˆλ‹€. μ΅œλŒ€ 길이λ₯Ό λ„˜μ–΄μ„œμ§€ μ•Šλ„λ‘, ν…μŠ€νŠΈ 길이가 512λ₯Ό λ„˜μ–΄κ°ˆ 경우 μ—¬λŸ¬ 개의 λ¬Έμžμ—΄λ‘œ λΆ„λ¦¬ν•©λ‹ˆλ‹€.
λ¬Έλ§₯ 정보λ₯Ό κ³ λ €ν•˜λ―€λ‘œ κ°€λŠ₯ν•œ κΈ΄ 길이둜 chunkingν•˜λŠ” 것이 쒋은 μ„±λŠ₯을 보μž₯ν•  수 μžˆμŠ΅λ‹ˆλ‹€.
checkpoint: NER λͺ¨λΈμ— λŒ€ν•œ 정보λ₯Ό λΆˆλŸ¬λ“€μž…λ‹ˆλ‹€.
Return:
ner_tokenizer_dict: μ•„λž˜ μ„Έ μš”μ†Œλ₯Ό ν¬ν•¨ν•œ λ”•μ…”λ„ˆλ¦¬μž…λ‹ˆλ‹€.
input_ids: 각 ν† ν°μ˜ λͺ¨λΈ λ”•μ…”λ„ˆλ¦¬μ—μ„œμ˜ μ•„μ΄λ””κ°’μž…λ‹ˆλ‹€.
attention_mask: 각 ν† ν°μ˜ μ–΄νƒ μ…˜ 마슀크 ν™œμ„±ν™” μ—¬λΆ€μž…λ‹ˆλ‹€.
token_type_ids: 개체λͺ… 인식 된 ν† ν°μ˜ 경우 κ·Έ νƒ€μž…μ˜ 아이디(숫자 μ‘°ν•©)λ₯Ό λ°˜ν™˜ν•©λ‹ˆλ‹€.
"""
    # Load the tokenizer from the saved checkpoint.
tokenizer = checkpoint['tokenizer']
    # Get the IDs of the special tokens for padding ([PAD]), sequence start ([CLS]), and sequence end ([SEP]).
pad_token_id = tokenizer.pad_token_id
cls_token_id = tokenizer.cls_token_id
sep_token_id = tokenizer.sep_token_id
    # Initialize the variable that keeps track of the previous syllable.
pre_syllable = "_"
#ν† ν¬λ‚˜μ΄μ§•λœ κ²°κ³Όλ₯Ό μ €μž₯ν•  λ¦¬μŠ€νŠΈλ“€μ„ μ΄ˆκΈ°ν™”ν•©λ‹ˆλ‹€.
input_ids = [pad_token_id] * (max_seq_length - 1)
attention_mask = [0] * (max_seq_length - 1)
token_type_ids = [0] * max_seq_length
    # Truncate the input text so it fits within the maximum sequence length (leaving room for [CLS] and [SEP]).
text = text[:max_seq_length-2]
#ν…μŠ€νŠΈμ˜ 각 μŒμ ˆμ— λŒ€ν•΄ λ°˜λ³΅λ¬Έμ„ μ‹€ν–‰ν•©λ‹ˆλ‹€.
for i, syllable in enumerate(text):
if syllable == '_':
pre_syllable = syllable
if pre_syllable != "_":
syllable = '##' + syllable
pre_syllable = syllable
        # Convert the syllable token to its ID in the model vocabulary and store it in input_ids.
input_ids[i] = tokenizer.convert_tokens_to_ids(syllable)
        # Enable the attention mask at this token position.
attention_mask[i] = 1
    # Prepend cls_token_id and append sep_token_id to the input sequence.
input_ids = [cls_token_id] + input_ids[:-1] + [sep_token_id]
#μ–΄ν…μ…˜ λ§ˆμŠ€ν¬λ„ μ‹œμž‘κ³Ό 끝 토큰을 κ³ λ €ν•˜μ—¬ μˆ˜μ •ν•©λ‹ˆλ‹€.
attention_mask = [1] + attention_mask[:-1] + [1]
ner_tokenizer_dict = {"input_ids": input_ids,
"attention_mask": attention_mask,
"token_type_ids": token_type_ids}
return ner_tokenizer_dict
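# A minimal usage sketch for ner_tokenizer (illustrative only; `checkpoint` is assumed to be
# the dict saved at training time with 'tokenizer', 'model', and 'tag2id' entries, which is
# what the functions in this module expect). Spaces are expected to already be replaced with
# underscores, as get_ner_predictions does before calling this function:
#
#   feats = ner_tokenizer('ν•œλ‹€μ •μ΄_학ꡐ에_κ°„λ‹€', 512, checkpoint)
#   feats['input_ids'][0] == checkpoint['tokenizer'].cls_token_id   # sequence starts with [CLS]
#   len(feats['input_ids']) == len(feats['attention_mask']) == 512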
def get_ner_predictions(text, checkpoint):
"""
ν† ν°ν™”ν•œ λ¬Έμž₯(tokenized_sent)κ³Ό μ˜ˆμΈ‘ν•œ νƒœκ·Έ(pred_tags) 값을 λ§Œλ“œλŠ” ν•¨μˆ˜μž…λ‹ˆλ‹€.
Args:
text: NER μ˜ˆμΈ‘μ„ ν•„μš”λ‘œ ν•˜λŠ” ν…μŠ€νŠΈλ₯Ό μž…λ ₯ν•©λ‹ˆλ‹€.
checkpoint: μ €μž₯ν•œ λͺ¨λΈμ„ λΆˆλŸ¬λ“€μž…λ‹ˆλ‹€.
Returns:
tokenized_sent: λͺ¨λΈ μž…λ ₯을 μœ„ν•œ ν† ν°ν™”λœ λ¬Έμž₯ μ •λ³΄μž…λ‹ˆλ‹€.
pred_tags: 각 토큰에 λŒ€ν•œ 예츑된 νƒœκ·Έλ“€μ„ ν¬ν•¨ν•©λ‹ˆλ‹€.
"""
    # Load the saved model.
model = checkpoint['model']
    # Get the mapping between tags and their IDs.
tag2id = checkpoint['tag2id']
model.to(device)
    # Replace spaces in the input text with underscores (_).
text = text.replace(' ', '_')
    # Create empty lists to store the predictions and the reference labels.
predictions, true_labels = [], []
    # Tokenize the text with the ner_tokenizer function.
tokenized_sent = ner_tokenizer(text, len(text) + 2, checkpoint)
#ν† ν°ν™”λœ κ²°κ³Όλ₯Ό ν† λŒ€λ‘œ ν…μ„œλ‘œ λ³€ν™˜ν•˜μ—¬ λͺ¨λΈ μž…λ ₯ ν˜•μ‹μ— 맞게 μ€€λΉ„ν•©λ‹ˆλ‹€.
input_ids = torch.tensor(
tokenized_sent['input_ids']).unsqueeze(0).to(device)
attention_mask = torch.tensor(
tokenized_sent['attention_mask']).unsqueeze(0).to(device)
token_type_ids = torch.tensor(
tokenized_sent['token_type_ids']).unsqueeze(0).to(device)
    # Run the following inside torch.no_grad() so no gradients are computed (this is evaluation, not training).
with torch.no_grad():
outputs = model(
input_ids=input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids)
#λͺ¨λΈ 좜λ ₯μ—μ„œ λ‘œμ§“ 값을 가져와 Numpyκ°’μœΌλ‘œ λ³€ν™˜ν•˜κ³ , 라벨 ID듀을 CPU μƒμ˜ NumPy λ°°μ—΄λ‘œ κ°€μ Έμ˜΅λ‹ˆλ‹€.
logits = outputs['logits']
logits = logits.detach().cpu().numpy()
label_ids = token_type_ids.cpu().numpy()
    # Take the predicted label for each token and add it to the list.
predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
    # Append the reference label IDs to the list.
true_labels.append(label_ids)
    # Convert the predicted label IDs back to tag strings.
pred_tags = [list(tag2id.keys())[p_i] for p in predictions for p_i in p]
return tokenized_sent, pred_tags
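# Example usage (a sketch; the exact tags depend on the trained model inside `checkpoint`):
#
#   tokenized_sent, pred_tags = get_ner_predictions('ν•œλ‹€μ •μ€ μ•„μΉ¨μ— ν•™κ΅λ‘œ κ°”λ‹€.', checkpoint)
#   len(pred_tags) == len(tokenized_sent['input_ids'])   # one tag per token, [CLS]/[SEP] included
#   # pred_tags might look like ['O', 'B-PER', 'I-PER', 'I-PER', ..., 'O']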
def ner_inference(tokenized_sent, pred_tags, checkpoint, name_len=5) -> list:
"""
NER을 μ‹€ν–‰ν•˜κ³ , 이름과 μ‹œκ°„ 및 곡간 정보λ₯Ό μΆ”μΆœν•©λ‹ˆλ‹€.
Args:
tokenized_sent: ν† ν°ν™”λœ λ¬Έμž₯이 μ €μž₯된 리슀트
pred_tags: 각 토큰에 λŒ€ν•œ 예츑 νƒœκ·Έκ°’ (NER κ²°κ³Ό)
checkpoint: μ €μž₯ν•΄λ‘” λͺ¨λΈμ„ 뢈러옴
name_len: 더 μ •ν™•ν•œ 이름 인식을 μœ„ν•΄ μ•žλ’€λ‘œ λͺ‡ 개의 μŒμ ˆμ„ 더 검토할지 μ§€μ •ν•©λ‹ˆλ‹€.
Returns:
namelist: μΆ”μΆœν•œ 이름(별칭 포함) λ¦¬μŠ€νŠΈμž…λ‹ˆλ‹€. ν›„μ²˜λ¦¬λ₯Ό 톡해
scene: μΆ”μΆœν•œ μž₯μ†Œ μ‹œκ°„ μ‚¬μ „μž…λ‹ˆλ‹€.
"""
name_list = []
speaker = ''
tokenizer = checkpoint['tokenizer']
scene = {'μž₯μ†Œ': [], 'μ‹œκ°„': []}
target = ''
c_tag = None
for i, tag in enumerate(pred_tags):
token = tokenizer.convert_ids_to_tokens(tokenized_sent['input_ids'][i]).replace('#', '')
if 'PER' in tag:
if 'B' in tag and speaker != '':
name_list.append(speaker)
speaker = ''
speaker += token
elif speaker != '' and tag != pred_tags[i-1]:
if speaker in name_list:
name_list.append(speaker)
else:
tmp = speaker
found_name = False
                # print(f'Double-checking {speaker} because it is not yet in the name list.')
for j in range(name_len):
if i + j < len(tokenized_sent['input_ids']):
token = tokenizer.convert_ids_to_tokens(
tokenized_sent['input_ids'][i+j]).replace('#', '')
tmp += token
                        # print(f'Looked ahead {j} syllables after {speaker}; candidate so far: {tmp}')
if tmp in name_list:
name_list.append(tmp)
found_name = True
                            # print(f'{tmp} already exists in the name list, so it was added instead of {speaker}.')
break
if not found_name:
name_list.append(speaker)
                    # print(f'No match found, so {speaker} was added as-is.')
speaker = ''
elif tag != 'O':
if tag.startswith('B'):
if c_tag in ['TIM', 'DAT']:
scene['μ‹œκ°„'].append(target)
elif c_tag =='LOC':
scene['μž₯μ†Œ'].append(target)
c_tag = tag[2:]
target = token
else:
target += token.replace('_', ' ')
    # Flush any name or scene entity that is still being collected when the loop ends,
    # so a trailing person/time/place is not silently dropped.
    if speaker != '':
        name_list.append(speaker)
    if c_tag in ['TIM', 'DAT']:
        scene['μ‹œκ°„'].append(target)
    elif c_tag == 'LOC':
        scene['μž₯μ†Œ'].append(target)
    return name_list, scene
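# Example usage (a sketch; the outputs depend on the model's predictions):
#
#   tokenized_sent, pred_tags = get_ner_predictions(text, checkpoint)
#   names, scene = ner_inference(tokenized_sent, pred_tags, checkpoint)
#   # names -> e.g. ['ν•œλ‹€μ •', '닀정이', ...]
#   # scene -> {'μž₯μ†Œ': ['학ꡐ', ...], 'μ‹œκ°„': ['μ•„μΉ¨', ...]}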
def make_name_list(ner_inputs, checkpoint):
"""
λ¬Έμž₯듀을 NER λŒλ €μ„œ Name List λ§Œλ“€κΈ°.
"""
name_list = []
times = []
places = []
for ner_input in ner_inputs:
tokenized_sent, pred_tags = get_ner_predictions(ner_input, checkpoint)
names, scene = ner_inference(tokenized_sent, pred_tags, checkpoint)
name_list.extend(names)
times.extend(scene['μ‹œκ°„'])
places.extend(scene['μž₯μ†Œ'])
return name_list, times, places
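# Putting the pieces together (a sketch; `ner_inputs` would typically be the text split into
# chunks short enough for the model, e.g. paragraph by paragraph):
#
#   ner_inputs = ['ν•œλ‹€μ •μ΄ μ•„μΉ¨μ— 학ꡐ μ•žμ—μ„œ 기닀렸닀.', '닀정이가 μ›ƒμœΌλ©° 말했닀.']
#   name_list, times, places = make_name_list(ner_inputs, checkpoint)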
def show_name_list(name_list):
"""
μ‚¬μš©μž μΉœν™”μ μœΌλ‘œ λ„€μž„λ¦¬μŠ€νŠΈλ₯Ό λ³΄μ—¬μ€λ‹ˆλ‹€.
Arg:
name_list: μΆ”μΆœν•œ 이름 리슀트
Return:
name: λ™μΌν•œ 이름이 λͺ‡ 번 λ“±μž₯ν–ˆλŠ”μ§€ 횟수λ₯Ό ν•¨κ»˜ μ œκ³΅ν•©λ‹ˆλ‹€.
"""
name = dict(Counter(name_list))
return name
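# Example (deterministic, since this is just a Counter over the list):
#
#   show_name_list(['ν•œλ‹€μ •', '닀정이', 'ν•œλ‹€μ •'])
#   # -> {'ν•œλ‹€μ •': 2, '닀정이': 1}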
def compare_strings(str1, str2):
"""
ner둜 μΆ”μΆœν•œ 인λͺ…을 ν›„μ²˜λ¦¬ν•˜λŠ” μ½”λ“œμž…λ‹ˆλ‹€.
비ꡐ할 두 λ¬Έμžμ—΄μ˜ 길이가 λ‹€λ₯Ό 경우, 더 짧은 λ¬Έμžμ—΄μ΄ 더 κΈ΄ λ¬Έμžμ—΄μ— ν¬ν•¨λ˜λŠ”μ§€ ν™•μΈν•©λ‹ˆλ‹€.
비ꡐ할 두 λ¬Έμžμ—΄μ˜ 길이가 같을 경우, κ²ΉμΉ˜λŠ” 뢀뢄이 2κΈ€μž 이상일 경우 같은 μ΄λ¦„μœΌλ‘œ μ§€μ •ν•©λ‹ˆλ‹€.
이 ν•¨μˆ˜μ™€ μ•„λž˜μ˜ combine_similar_namesλ₯Ό ν•¨κ»˜ μ‹€ν–‰ν•˜λ©΄, 'ν•œλ‹€μ •'κ³Ό '닀정이', '닀정이가' 등은 λͺ¨λ‘ ν•˜λ‚˜μ˜ 인물둜 묢을 수 μžˆμŠ΅λ‹ˆλ‹€.
Args: λΉ„κ΅ν•˜λ €λŠ” 두 λ¬Έμžμ—΄
Return: 두 λ¬Έμžμ—΄μ΄ 같은 μ΄λ¦„μœΌλ‘œ νŒλ‹¨λ  경우 True, 아닐 경우 False
"""
if len(str1) != len(str2):
        # Check whether the shorter string is contained in the longer one.
shorter, longer = (str1, str2) if len(str1) < len(str2) else (str2, str1)
if shorter in longer:
return True
else:
same_part = []
for i in range(len(str1)):
if str1[i] in str2:
same_part += str1[i]
continue
else:
break
if len(same_part) >= 2:
return True
return False
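# A few concrete cases of the rules above (note that the equal-length check only scans str1
# from the left, so the argument order can matter):
#
#   compare_strings('닀정이', '닀정이가')   # True: the shorter string is contained in the longer one
#   compare_strings('닀정이', 'ν•œλ‹€μ •')     # True: same length and they share two characters ('λ‹€', '정')
#   compare_strings('ν•œλ‹€μ •', '민쀀이')     # False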
def combine_similar_names(names_dict):
"""
compare_strings ν•¨μˆ˜λ₯Ό λ°”νƒ•μœΌλ‘œ μœ μ‚¬ν•œ 이름을 ν•¨κ»˜ λ¬ΆμŠ΅λ‹ˆλ‹€.
2κΈ€μžλŠ” 이름일 ν™•λ₯ μ΄ λ†’μœΌλ‹ˆ κΈ°μ€€μ μœΌλ‘œ μ§€μ •ν•©λ‹ˆλ‹€.
"""
names = names_dict.keys()
similar_groups = [[name] for name in names if len(name) == 2]
idx = 0
# print(similar_groups, '\n',idx)
for name in names:
found = False
for group in similar_groups:
idx += 1
for item in group:
if compare_strings(name, item) and len(name)>1:
found = True
cleaned_text = re.sub(r'(μ•„|이)$', '', item)
if len(name) == len(item):
same_part = ''
                        # Check whether there is a fully matching part.
for i in range(len(name)):
if name[i] in item:
same_part += name[i]
if same_part not in group and cleaned_text not in group:
group.append(cleaned_text)
                            # print(similar_groups, '\n', idx, 'when the strings have the same length')
else:
group.append(name)
                        # print(similar_groups, '\n', idx, 'when the strings have different lengths')
break
if found:
break
if not found:
similar_groups.append([name])
updated_names = {tuple(name for name in group if len(name) > 1): counts for group, counts in (
(group, sum(names_dict[name] for name in group if name != '')) for group in similar_groups)
if len([name for name in group if len(name) > 1]) > 0}
return updated_names
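# Worked example (deterministic for this toy input): the three surface forms are merged into
# one group keyed by the tuple of aliases, with their counts summed:
#
#   combine_similar_names({'닀정': 3, '닀정이': 2, 'ν•œλ‹€μ •': 1})
#   # -> {('닀정', '닀정이', 'ν•œλ‹€μ •'): 6}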
def convert_name2codename(codename2name, text):
"""REλ₯Ό μ΄μš©ν•˜μ—¬ 이름을 μ½”λ“œλ„€μž„μœΌλ‘œ λ³€κ²½ν•©λ‹ˆλ‹€. μ΄λ•Œ 각 μ½”λ“œλ„€μž„μ˜ λ²ˆν˜ΈλŠ” λΉˆλ„μˆ˜ κΈ°μ€€ λ‚΄λ¦Όμ°¨μˆœν•œ κ²°κ³Όμž…λ‹ˆλ‹€."""
import re
for n_list in codename2name.values():
n_list.sort(key=lambda x:(len(x), x), reverse=True)
for codename, n_list in codename2name.items():
for subname in n_list:
text = re.sub(subname, codename, text)
return text
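# Example (a sketch; the codename format used elsewhere in the project may differ, 'A' and 'B'
# here are just placeholders):
#
#   codename2name = {'A': ['닀정', '닀정이', 'ν•œλ‹€μ •'], 'B': ['민준']}
#   convert_name2codename(codename2name, '닀정이가 λ―Όμ€€μ—κ²Œ 말했닀')
#   # -> 'Aκ°€ Bμ—κ²Œ 말했닀'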
def convert_codename2name(codename2name, text):
"""μ½”λ“œλ„€μž„μ„ μ΄λ¦„μœΌλ‘œ λ³€κ²½ν•΄μ€λ‹ˆλ‹€."""
outputs = []
for i in text:
try:
outputs.append(codename2name[i][0])
        except (KeyError, IndexError):
            outputs.append('μ•Œ 수 μ—†μŒ')  # 'unknown'
return outputs
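# Example (continuing the sketch above; unknown codenames fall back to 'μ•Œ 수 μ—†μŒ'):
#
#   convert_codename2name({'A': ['닀정', '닀정이'], 'B': ['민준']}, ['A', 'B', 'C'])
#   # -> ['닀정', '민준', 'μ•Œ 수 μ—†μŒ']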