ja_sentence_segmenter.normalize.neologd_normalizer
テキストの正規化処理.
正規化のコードは以下を参考に一部修正を加えています。 https://github.com/neologd/mecab-ipadic-neologd/wiki/Regexp.ja#python-written-by-hideaki-t--overlast
"""Text normalization.

The normalization rules are adapted, with some modifications, from:
https://github.com/neologd/mecab-ipadic-neologd/wiki/Regexp.ja#python-written-by-hideaki-t--overlast

Every width-sensitive character used by the rules is spelled as a Unicode
escape so that full-width and half-width forms cannot be confused or silently
folded into each other by editors and text pipelines.
"""
from __future__ import unicode_literals

import re
import unicodedata
from typing import Dict, Generator, Iterator, List, Union, overload

# Full-width digits (FF10-FF19), full-width Latin letters (FF21-FF3A, FF41-FF5A)
# and the half-width CJK punctuation / katakana range (FF61-FF9F).
_WIDE_ALNUM_AND_HALFWIDTH_KANA = "\uFF10-\uFF19\uFF21-\uFF3A\uFF41-\uFF5A\uFF61-\uFF9F"

# Full-width symbols that are folded back to ASCII in the last step.
# U+FF1D (full-width "=") and the CJK punctuation U+3002, U+3001, U+30FB,
# U+300C, U+300D are deliberately NOT listed, so they stay full-width.
_WIDE_SYMBOLS = (
    "\uFF01\u201D\uFF03-\uFF06\u2019\uFF08-\uFF0F\uFF1A-\uFF1C\uFF1E-\uFF20"
    "\uFF3B\uFFE5\uFF3D-\uFF40\uFF5B-\uFF5D\u301C"
)

_HYPHENS = re.compile("[\u02D7\u058A\u2010\u2011\u2012\u2013\u2043\u207B\u208B\u2212]+")
_CHOONPUS = re.compile("[\uFE63\uFF0D\uFF70\u2014\u2015\u2500\u2501\u30FC]+")
_TILDES = re.compile("[~\u223C\u223E\u301C\u3030\uFF5E]")
_SPACE_RUNS = re.compile("[ \u3000]+")  # ASCII space and ideographic space

# ASCII symbols -> full-width forms. Most full-width forms sit at a fixed
# offset (0xFEE0) from their ASCII counterpart; the exceptions are listed
# explicitly below.
_TO_WIDE_SYMBOLS: Dict[int, int] = {ord(c): ord(c) + 0xFEE0 for c in "!#$%&()*+,-./:;<=>?@[]^_`{|}"}
_TO_WIDE_SYMBOLS.update(
    {
        ord('"'): 0x201D,  # RIGHT DOUBLE QUOTATION MARK
        ord("'"): 0x2019,  # RIGHT SINGLE QUOTATION MARK
        0x00A5: 0xFFE5,  # YEN SIGN -> FULLWIDTH YEN SIGN
        ord("~"): 0x301C,  # WAVE DASH
        0xFF61: 0x3002,  # half-width ideographic full stop
        0xFF64: 0x3001,  # half-width ideographic comma
        0xFF65: 0x30FB,  # half-width katakana middle dot
        0xFF62: 0x300C,  # half-width left corner bracket
        0xFF63: 0x300D,  # half-width right corner bracket
    }
)

_CJK_BLOCKS = "".join(
    (
        "\u4E00-\u9FFF",  # CJK UNIFIED IDEOGRAPHS
        "\u3040-\u309F",  # HIRAGANA
        "\u30A0-\u30FF",  # KATAKANA
        "\u3000-\u303F",  # CJK SYMBOLS AND PUNCTUATION
        "\uFF00-\uFFEF",  # HALFWIDTH AND FULLWIDTH FORMS
    )
)
_BASIC_LATIN = "\u0000-\u007F"

# A single space is dropped when it sits between CJK/CJK, CJK/Latin or
# Latin/CJK characters. Lookarounds do not consume the neighbours, so chains
# such as "A B C" are handled in one pass without re-scanning.
_SPACE_BETWEEN = tuple(
    re.compile("(?<=[{}]) (?=[{}])".format(left, right))
    for left, right in (
        (_CJK_BLOCKS, _CJK_BLOCKS),
        (_CJK_BLOCKS, _BASIC_LATIN),
        (_BASIC_LATIN, _CJK_BLOCKS),
    )
)


def __unicode_normalize(cls: str, s: str) -> str:
    """Apply NFKC normalization only to runs of characters in ``cls``.

    Parameters
    ----------
    cls : str
        contents of a regex character class (without the brackets).
    s : str
        text to process.

    Returns
    -------
    str
        ``s`` with every run of ``cls`` characters NFKC-normalized and every
        remaining full-width hyphen-minus (U+FF0D) replaced by ASCII ``-``.
    """
    runs = re.compile("[{}]+".format(cls))
    s = runs.sub(lambda m: unicodedata.normalize("NFKC", m.group(0)), s)
    return s.replace("\uFF0D", "-")


def __remove_extra_spaces(s: str) -> str:
    """Collapse runs of spaces and drop spaces adjacent to CJK characters.

    Parameters
    ----------
    s : str
        text to process.

    Returns
    -------
    str
        text where runs of ASCII/ideographic spaces are a single ASCII space,
        and single spaces between CJK/CJK, CJK/Latin and Latin/CJK characters
        are removed. Spaces between two Basic Latin characters are kept.
    """
    s = _SPACE_RUNS.sub(" ", s)
    for pattern in _SPACE_BETWEEN:
        s = pattern.sub("", s)
    return s


def __normalize_neologd(s: str, remove_tildes: bool) -> str:
    """Normalize one text following the mecab-ipadic-neologd rules.

    Parameters
    ----------
    s : str
        text to normalize.
    remove_tildes : bool
        if True tilde-like characters are deleted, otherwise they are unified.

    Returns
    -------
    str
        normalized text.
    """
    s = s.strip()
    s = __unicode_normalize(_WIDE_ALNUM_AND_HALFWIDTH_KANA, s)

    s = _HYPHENS.sub("-", s)  # normalize hyphens
    s = _CHOONPUS.sub("\u30FC", s)  # normalize choonpus (prolonged sound marks)
    # Either remove tildes (original rule) or unify them to "~"; the
    # translation below then turns "~" into WAVE DASH (U+301C).
    s = _TILDES.sub("" if remove_tildes else "~", s)

    # Temporarily widen ASCII symbols so that the space-removal step treats
    # them as CJK-side characters; they are narrowed again right after.
    s = s.translate(_TO_WIDE_SYMBOLS)
    s = __remove_extra_spaces(s)
    s = __unicode_normalize(_WIDE_SYMBOLS, s)

    return s.replace("\u2019", "'").replace("\u201D", '"')


def __normalize_iter(texts: Iterator[str], remove_tildes: bool) -> Generator[str, None, None]:
    """Lazily normalize each text of ``texts`` in order."""
    for text in texts:
        yield __normalize_neologd(text, remove_tildes)


@overload
def normalize(arg: str, remove_tildes: bool = False) -> Generator[str, None, None]:
    ...


@overload
def normalize(arg: List[str], remove_tildes: bool = False) -> Generator[str, None, None]:
    ...


@overload
def normalize(arg: Iterator[str], remove_tildes: bool = False) -> Generator[str, None, None]:
    ...


def normalize(arg: Union[str, List[str], Iterator[str]], remove_tildes: bool = False) -> Generator[str, None, None]:
    """Normalize text with mecab-ipadic-neologd rules.

    Parameters
    ----------
    arg : Union[str, List[str], Iterator[str]]
        texts you want to normalize. A single ``str`` is treated as one text;
        any other iterable of ``str`` (list, iterator, tuple, ...) is
        normalized element by element.
    remove_tildes : bool, optional
        whether to remove tildes, by default False

    Yields
    ------
    str
        normalized texts, one per input text.

    Raises
    ------
    TypeError
        if ``arg`` is neither a ``str`` nor an iterable of ``str``. Because
        this is a generator, the error surfaces on first iteration.
    """
    if isinstance(arg, str):
        texts = iter([arg])
    elif isinstance(arg, (bytes, bytearray)):
        # Iterating bytes yields ints, which cannot be normalized as text.
        raise TypeError("normalize() expects str or an iterable of str, got {}".format(type(arg).__name__))
    else:
        try:
            texts = iter(arg)
        except TypeError:
            raise TypeError(
                "normalize() expects str or an iterable of str, got {}".format(type(arg).__name__)
            ) from None
    yield from __normalize_iter(texts, remove_tildes)
def normalize(arg: Union[str, List[str], Iterator[str]], remove_tildes: bool = False) -> Generator[str, None, None]:
def normalize(arg: Union[str, List[str], Iterator[str]], remove_tildes: bool = False) -> Generator[str, None, None]:
    """Normalize text with mecab-ipadic-neologd rules.

    Parameters
    ----------
    arg : Union[str, List[str], Iterator[str]]
        texts you want to normalize. A single ``str`` is treated as one text;
        any other iterable of ``str`` (list, iterator, tuple, ...) is
        normalized element by element.
    remove_tildes : bool, optional
        whether to remove tildes, by default False

    Yields
    ------
    str
        normalized texts, one per input text.

    Raises
    ------
    TypeError
        if ``arg`` is neither a ``str`` nor an iterable of ``str``. Because
        this is a generator, the error surfaces on first iteration.
    """
    if isinstance(arg, str):
        texts = iter([arg])
    elif isinstance(arg, (bytes, bytearray)):
        # Iterating bytes yields ints, which cannot be normalized as text.
        raise TypeError("normalize() expects str or an iterable of str, got {}".format(type(arg).__name__))
    else:
        try:
            # Accept any iterable instead of silently yielding nothing for
            # inputs that are not exactly list/Iterator (e.g. tuples).
            texts = iter(arg)
        except TypeError:
            raise TypeError(
                "normalize() expects str or an iterable of str, got {}".format(type(arg).__name__)
            ) from None
    yield from __normalize_iter(texts, remove_tildes)
Normalize text with mecab-ipadic-neologd rules.
Parameters
- arg (Union[str, List[str], Iterator[str]]): texts you want to normalize.
- remove_tildes (bool, optional): whether to remove tildes, by default False
Yields
- str: normalized texts, one per input text (the function itself returns a Generator[str, None, None]).