ja_sentence_segmenter.normalize.neologd_normalizer

テキストの正規化処理.

正規化のコードは以下を参考に一部修正を加えています。 https://github.com/neologd/mecab-ipadic-neologd/wiki/Regexp.ja#python-written-by-hideaki-t--overlast

  1"""テキストの正規化処理.
  2
  3正規化のコードは以下を参考に一部修正を加えています。
  4https://github.com/neologd/mecab-ipadic-neologd/wiki/Regexp.ja#python-written-by-hideaki-t--overlast
  5"""
  6from __future__ import unicode_literals
  7
  8import re
  9import unicodedata
 10from typing import Dict, Generator, Iterator, List, Union, overload
 11
 12
 13def __unicode_normalize(cls: str, s: str) -> str:
 14    pt = re.compile("([{}]+)".format(cls))
 15
 16    def norm(c: str) -> str:
 17        return unicodedata.normalize("NFKC", c) if pt.match(c) else c
 18
 19    s = "".join(norm(x) for x in re.split(pt, s))
 20    s = re.sub("-", "-", s)
 21    return s
 22
 23
 24def __remove_extra_spaces(s: str) -> str:
 25    s = re.sub("[  ]+", " ", s)
 26    blocks = "".join(
 27        (
 28            "\u4E00-\u9FFF",  # CJK UNIFIED IDEOGRAPHS
 29            "\u3040-\u309F",  # HIRAGANA
 30            "\u30A0-\u30FF",  # KATAKANA
 31            "\u3000-\u303F",  # CJK SYMBOLS AND PUNCTUATION
 32            "\uFF00-\uFFEF",  # HALFWIDTH AND FULLWIDTH FORMS
 33        )
 34    )
 35    basic_latin = "\u0000-\u007F"
 36
 37    def remove_space_between(cls1: str, cls2: str, s: str) -> str:
 38        p = re.compile("([{}]) ([{}])".format(cls1, cls2))
 39        while p.search(s):
 40            s = p.sub(r"\1\2", s)
 41        return s
 42
 43    s = remove_space_between(blocks, blocks, s)
 44    s = remove_space_between(blocks, basic_latin, s)
 45    s = remove_space_between(basic_latin, blocks, s)
 46    return s
 47
 48
 49def __normalize_neologd(s: str, remove_tildes: bool) -> str:
 50    s = s.strip()
 51    s = __unicode_normalize("0-9A-Za-z。-゚", s)
 52
 53    def maketrans(f: str, t: str) -> Dict[int, int]:
 54        return {ord(x): ord(y) for x, y in zip(f, t)}
 55
 56    s = re.sub("[˗֊‐‑‒–⁃⁻₋−]+", "-", s)  # normalize hyphens
 57    s = re.sub("[﹣-ー—―─━ー]+", "ー", s)  # normalize choonpus
 58    if remove_tildes:
 59        s = re.sub("[~∼∾〜〰~]", "", s)  # remove tildes (original)
 60    else:
 61        s = re.sub("[~∼∾〜〰~]", "~", s)  # normalize tildes (modified by wwwcojp)
 62
 63    s = s.translate(maketrans("!\"#$%&'()*+,-./:;<=>?@[¥]^_`{|}~。、・「」", "!”#$%&’()*+,-./:;<=>?@[¥]^_`{|}〜。、・「」"))
 64
 65    s = __remove_extra_spaces(s)
 66    s = __unicode_normalize("!”#$%&’()*+,-./:;<>?@[¥]^_`{|}〜", s)  # keep =,・,「,」
 67    s = re.sub("[’]", "'", s)
 68    s = re.sub("[”]", '"', s)
 69    return s
 70
 71
 72def __normalize_iter(texts: Iterator[str], remove_tildes: bool) -> Generator[str, None, None]:
 73    for text in texts:
 74        yield __normalize_neologd(text, remove_tildes)
 75
 76
 77@overload
 78def normalize(arg: str, remove_tildes: bool = False) -> Generator[str, None, None]:
 79    ...
 80
 81
 82@overload
 83def normalize(arg: List[str], remove_tildes: bool = False) -> Generator[str, None, None]:
 84    ...
 85
 86
 87@overload
 88def normalize(arg: Iterator[str], remove_tildes: bool = False) -> Generator[str, None, None]:
 89    ...
 90
 91
 92def normalize(arg: Union[str, List[str], Iterator[str]], remove_tildes: bool = False) -> Generator[str, None, None]:
 93    """Normalize text with mecab-ipadic-neologd rules.
 94
 95    Parameters
 96    ----------
 97    arg : Union[str, List[str], Iterator[str]]
 98        texts you want to normalize.
 99    remove_tildes : bool, optional
100        whether to remove tildes, by default False
101
102    Yields
103    ------
104    Generator[str, None, None]
105        normalized texts.
106    """
107    if isinstance(arg, str):
108        yield from __normalize_iter(iter([arg]), remove_tildes)
109    elif isinstance(arg, list):
110        yield from __normalize_iter(iter(arg), remove_tildes)
111    elif isinstance(arg, Iterator):
112        yield from __normalize_iter(arg, remove_tildes)
def normalize(arg: Union[str, List[str], Iterator[str]], remove_tildes: bool = False) -> Generator[str, None, None]:
 93def normalize(arg: Union[str, List[str], Iterator[str]], remove_tildes: bool = False) -> Generator[str, None, None]:
 94    """Normalize text with mecab-ipadic-neologd rules.
 95
 96    Parameters
 97    ----------
 98    arg : Union[str, List[str], Iterator[str]]
 99        texts you want to normalize.
100    remove_tildes : bool, optional
101        whether to remove tildes, by default False
102
103    Yields
104    ------
105    Generator[str, None, None]
106        normalized texts.
107    """
108    if isinstance(arg, str):
109        yield from __normalize_iter(iter([arg]), remove_tildes)
110    elif isinstance(arg, list):
111        yield from __normalize_iter(iter(arg), remove_tildes)
112    elif isinstance(arg, Iterator):
113        yield from __normalize_iter(arg, remove_tildes)

Normalize text with mecab-ipadic-neologd rules.

Parameters
  • arg (Union[str, List[str], Iterator[str]]): texts you want to normalize.
  • remove_tildes (bool, optional): whether to remove tildes, by default False
Yields
  • Generator[str, None, None]: normalized texts.