ja_sentence_segmenter.split.simple_splitter
Simple sentence splitter for Japanese text.
"""Simple sentence splitter for Japanese text."""

import re
from typing import Generator, Iterator, List, Match, Union, overload

BETWEEN_QUOTE_JA_REGEX = r"「[^「」]*」"
BETWEEN_PARENS_JA_REGEX = r"\([^()]*\)"
ESCAPE_CHAR = "∯"
DEFAULT_PUNCTUATION_REGEX = r"。!?"
"""default punctuation characters for splitting."""


def __to_text_iterator(arg: Union[str, List[str], Iterator[str]]) -> Iterator[str]:
    """Normalize the accepted argument forms to an iterator of texts.

    A single string becomes a one-element iterator, a list is iterated, and
    an iterator is passed through untouched. Any other type yields an empty
    iterator, so the public splitters produce nothing for it.
    """
    if isinstance(arg, str):
        return iter([arg])
    if isinstance(arg, list):
        return iter(arg)
    if isinstance(arg, Iterator):
        return arg
    return iter([])


def __split_newline_iter(texts: Iterator[str]) -> Generator[str, None, None]:
    """Yield every line of every text in ``texts``, in input order."""
    for text in texts:
        yield from text.splitlines()


@overload
def split_newline(arg: str) -> Generator[str, None, None]:
    ...


@overload
def split_newline(arg: List[str]) -> Generator[str, None, None]:
    ...


@overload
def split_newline(arg: Iterator[str]) -> Generator[str, None, None]:
    ...


def split_newline(arg: Union[str, List[str], Iterator[str]]) -> Generator[str, None, None]:
    """Split text at line boundaries.

    Parameters
    ----------
    arg : Union[str, List[str], Iterator[str]]
        text, list of texts, or iterator of texts you want to split.
        Any other argument type yields nothing.

    Yields
    ------
    Generator[str, None, None]
        lines of the input texts, without their line terminators.
    """
    yield from __split_newline_iter(__to_text_iterator(arg))


def __split_punctuation_iter(
    texts: Iterator[str], punctuations: str, split_between_quote: bool, split_between_parens: bool
) -> Generator[str, None, None]:
    """Yield the sentences of each text, split after punctuation characters.

    ``punctuations`` is interpolated into a regex character class. Punctuation
    that must not trigger a split (inside quotes and/or parentheses) is
    temporarily wrapped in ``ESCAPE_CHAR`` on both sides, skipped by the split
    pattern, and unwrapped again before the sentences are yielded.
    """
    # These patterns depend only on ``punctuations``; build them once per call
    # rather than once per text. The same "bare" pattern serves both the
    # escaping pass and the splitting pass.
    bare_punctuation = re.compile(rf"(?<!{ESCAPE_CHAR})([{punctuations}])(?!{ESCAPE_CHAR})")
    escaped_punctuation = re.compile(rf"({ESCAPE_CHAR})([{punctuations}])({ESCAPE_CHAR})")
    escaped_replacement = rf"{ESCAPE_CHAR}\1{ESCAPE_CHAR}"

    def escape_between_punctuation(match: Match[str]) -> str:
        # Wrap every not-yet-escaped punctuation inside the matched span.
        return bare_punctuation.sub(escaped_replacement, match.group())

    for text in texts:
        temp = text
        if not split_between_quote:
            temp = re.sub(BETWEEN_QUOTE_JA_REGEX, escape_between_punctuation, temp)
        if not split_between_parens:
            temp = re.sub(BETWEEN_PARENS_JA_REGEX, escape_between_punctuation, temp)
        # Insert a line break after each unescaped punctuation, strip the
        # escape markers, then let splitlines() cut the sentences apart.
        marked = bare_punctuation.sub("\\1\n", temp)
        yield from escaped_punctuation.sub("\\2", marked).splitlines()


@overload
def split_punctuation(
    arg: str, punctuations: str = DEFAULT_PUNCTUATION_REGEX, split_between_quote: bool = False, split_between_parens: bool = False
) -> Generator[str, None, None]:
    ...


@overload
def split_punctuation(
    arg: List[str], punctuations: str = DEFAULT_PUNCTUATION_REGEX, split_between_quote: bool = False, split_between_parens: bool = False
) -> Generator[str, None, None]:
    ...


@overload
def split_punctuation(
    arg: Iterator[str], punctuations: str = DEFAULT_PUNCTUATION_REGEX, split_between_quote: bool = False, split_between_parens: bool = False
) -> Generator[str, None, None]:
    ...


def split_punctuation(
    arg: Union[str, List[str], Iterator[str]],
    punctuations: str = DEFAULT_PUNCTUATION_REGEX,
    split_between_quote: bool = False,
    split_between_parens: bool = False,
) -> Generator[str, None, None]:
    """Split text at punctuation characters.

    Parameters
    ----------
    arg : Union[str, List[str], Iterator[str]]
        text, list of texts, or iterator of texts you want to split.
        Any other argument type yields nothing.
    punctuations : str, optional
        punctuation characters, used inside a regular-expression character
        class, by default DEFAULT_PUNCTUATION_REGEX
    split_between_quote : bool, optional
        also split at punctuation located between quotes, by default False
    split_between_parens : bool, optional
        also split at punctuation located between parentheses, by default False

    Yields
    ------
    Generator[str, None, None]
        sentences, each ending with the punctuation it was split at (the
        trailing remainder of a text is yielded as-is).
    """
    yield from __split_punctuation_iter(__to_text_iterator(arg), punctuations, split_between_quote, split_between_parens)
DEFAULT_PUNCTUATION_REGEX =
'。!?'
default punctuation characters for splitting.
def
split_newline( arg: Union[str, List[str], Iterator[str]]) -> Generator[str, NoneType, NoneType]:
def split_newline(arg: Union[str, List[str], Iterator[str]]) -> Generator[str, None, None]:
    """Break the given text(s) apart at line boundaries.

    Parameters
    ----------
    arg : Union[str, List[str], Iterator[str]]
        a single text, a list of texts, or an iterator of texts.

    Yields
    ------
    Generator[str, None, None]
        each line of each text, in input order, without its line terminator.
    """
    if isinstance(arg, str):
        sources: Iterator[str] = iter([arg])
    elif isinstance(arg, list):
        sources = iter(arg)
    elif isinstance(arg, Iterator):
        sources = arg
    else:
        # Any other argument type produces no output at all.
        return
    for chunk in sources:
        yield from chunk.splitlines()
Split text with line boundaries.
Parameters
- arg (Union[str, List[str], Iterator[str]]): texts you want to split.
Yields
- Generator[str, None, None]: texts split at line boundaries.
def
split_punctuation( arg: Union[str, List[str], Iterator[str]], punctuations: str = '。!?', split_between_quote: bool = False, split_between_parens: bool = False) -> Generator[str, NoneType, NoneType]:
def split_punctuation(
    arg: Union[str, List[str], Iterator[str]],
    punctuations: str = DEFAULT_PUNCTUATION_REGEX,
    split_between_quote: bool = False,
    split_between_parens: bool = False,
) -> Generator[str, None, None]:
    """Cut the given text(s) into sentences at punctuation characters.

    Parameters
    ----------
    arg : Union[str, List[str], Iterator[str]]
        a single text, a list of texts, or an iterator of texts.
    punctuations : str, optional
        regular expression for the punctuation characters,
        by default DEFAULT_PUNCTUATION_REGEX
    split_between_quote : bool, optional
        whether punctuation between quotes also splits, by default False
    split_between_parens : bool, optional
        whether punctuation between parentheses also splits, by default False

    Yields
    ------
    Generator[str, None, None]
        the pieces of each text after splitting at punctuation.
    """
    if isinstance(arg, str):
        sources: Iterator[str] = iter([arg])
    elif isinstance(arg, list):
        sources = iter(arg)
    elif isinstance(arg, Iterator):
        sources = arg
    else:
        # Any other argument type produces no output at all.
        return
    yield from __split_punctuation_iter(sources, punctuations, split_between_quote, split_between_parens)
Split text at punctuation marks.
Parameters
- arg (Union[str, List[str], Iterator[str]]): texts you want to split
- punctuations (str, optional): regular expression for punctuation characters, by default DEFAULT_PUNCTUATION_REGEX
- split_between_quote (bool, optional): also split at punctuation that appears between quotes, by default False
- split_between_parens (bool, optional): also split at punctuation that appears between parentheses, by default False
Yields
- Generator[str, None, None]: texts split at punctuation marks.