huspacy
/

hu_core_news_md

Token Classification

Model card Files Files and versions Community

hu_core_news_md / lemma_postprocessing.py

oroszgy's picture

Update spacy pipeline to 3.8.0

aab9c36 verified 25 days ago

history blame contribute delete

3.35 kB

	"""
	This module contains various rule-based components aiming to improve on baseline lemmatization tools.
	"""

	import re
	from typing import List, Callable

	from spacy.lang.hu import Hungarian
	from spacy.pipeline import Pipe
	from spacy.tokens import Token
	from spacy.tokens.doc import Doc


	@Hungarian.component(
	"lemma_case_smoother",
	assigns=["token.lemma"],
	requires=["token.lemma", "token.pos"],
	)
	def lemma_case_smoother(doc: Doc) -> Doc:
	"""Smooth lemma casing by POS.

	DEPRECATED: This is not needed anymore, as the lemmatizer is now case-insensitive.

	Args:
	doc (Doc): Input document.

	Returns:
	Doc: Output document.
	"""
	for token in doc:
	if token.is_sent_start and token.tag_ != "PROPN":
	token.lemma_ = token.lemma_.lower()

	return doc


	class LemmaSmoother(Pipe):
	"""Smooths lemma by fixing common errors of the edit-tree lemmatizer."""

	_DATE_PATTERN = re.compile(r"(\d+)-j?[éá]?n?a?(t[őó]l)?")
	_NUMBER_PATTERN = re.compile(r"(\d+([-,/_.:]?(._)?\d+)*%?)")

	# noinspection PyUnusedLocal
	@staticmethod
	@Hungarian.factory("lemma_smoother", assigns=["token.lemma"], requires=["token.lemma", "token.pos"])
	def create_lemma_smoother(nlp: Hungarian, name: str) -> "LemmaSmoother":
	return LemmaSmoother()

	def __call__(self, doc: Doc) -> Doc:
	rules: List[Callable] = [
	self._remove_exclamation_marks,
	self._remove_question_marks,
	self._remove_date_suffixes,
	self._remove_suffix_after_numbers,
	]

	for token in doc:
	for rule in rules:
	rule(token)

	return doc

	@classmethod
	def _remove_exclamation_marks(cls, token: Token) -> None:
	"""Removes exclamation marks from the lemma.

	Args:
	token (Token): The original token.
	"""

	if "!" != token.lemma_:
	exclamation_mark_index = token.lemma_.find("!")
	if exclamation_mark_index != -1:
	token.lemma_ = token.lemma_[:exclamation_mark_index]

	@classmethod
	def _remove_question_marks(cls, token: Token) -> None:
	"""Removes question marks from the lemma.

	Args:
	token (Token): The original token.
	"""

	if "?" != token.lemma_:
	question_mark_index = token.lemma_.find("?")
	if question_mark_index != -1:
	token.lemma_ = token.lemma_[:question_mark_index]

	@classmethod
	def _remove_date_suffixes(cls, token: Token) -> None:
	"""Fixes the suffixes of dates.

	Args:
	token (Token): The original token.
	"""

	if token.pos_ == "NOUN":
	match = cls._DATE_PATTERN.match(token.lemma_)
	if match is not None:
	token.lemma_ = match.group(1) + "."

	@classmethod
	def _remove_suffix_after_numbers(cls, token: Token) -> None:
	"""Removes suffixes after numbers.

	Args:
	token (str): The original token.
	"""

	if token.pos_ == "NUM":
	match = cls._NUMBER_PATTERN.match(token.text)
	if match is not None:
	token.lemma_ = match.group(0)