ja_sentence_segmenter.split.simple_splitter

Simple sentence splitter for Japanese text.

  1"""Simple sentence splitter for japanese text."""
  2
  3import re
  4from typing import Generator, Iterator, List, Match, Union, overload
  5
  6BETWEEN_QUOTE_JA_REGEX = r"「[^「」]*」"  # one 「...」 span that contains no corner bracket inside it
  7BETWEEN_PARENS_JA_REGEX = r"\([^()]*\)"  # one (...) span that contains no parenthesis inside it
  8ESCAPE_CHAR = "∯"  # marker placed on both sides of a punctuation character that must not trigger a split
  9DEFAULT_PUNCTUATION_REGEX = r"。!?"  # interpolated into a regex character class, so every character counts as one punctuation mark
 10"""default punctuation characters for splitting."""
 11
 12
 13def __split_newline_iter(texts: Iterator[str]) -> Generator[str, None, None]:
 14    for text in texts:
 15        for line in text.splitlines():
 16            yield line
 17
 18
 19@overload
 20def split_newline(arg: str) -> Generator[str, None, None]:
 21    ...
 22
 23
 24@overload
 25def split_newline(arg: List[str]) -> Generator[str, None, None]:
 26    ...
 27
 28
 29@overload
 30def split_newline(arg: Iterator[str]) -> Generator[str, None, None]:
 31    ...
 32
 33
 34def split_newline(arg: Union[str, List[str], Iterator[str]]) -> Generator[str, None, None]:
 35    """Split text with line boundaries.
 36
 37    Parameters
 38    ----------
 39    arg : Union[str, List[str], Iterator[str]]
 40        texts you want to split.
 41
 42    Yields
 43    ------
 44    Generator[str, None, None]
 45        texts splitted with line boundaries.
 46    """
 47    if isinstance(arg, str):
 48        yield from __split_newline_iter(iter([arg]))
 49    elif isinstance(arg, list):
 50        yield from __split_newline_iter(iter(arg))
 51    elif isinstance(arg, Iterator):
 52        yield from __split_newline_iter(arg)
 53
 54
 55def __split_punctuation_iter(texts: Iterator[str], punctuations: str, split_between_quote: bool, split_between_parens: bool) -> Generator[str, None, None]:
 56    def escape_between_punctuation(match: Match[str]) -> str:
 57        text = match.group()
 58        escapeRegex = rf"(?<!{ESCAPE_CHAR})([{punctuations}])(?!{ESCAPE_CHAR})"
 59        result = re.sub(escapeRegex, rf"{ESCAPE_CHAR}\1{ESCAPE_CHAR}", text)
 60        return result
 61
 62    def escape_between_quote(text: str) -> str:
 63        result = re.sub(BETWEEN_QUOTE_JA_REGEX, escape_between_punctuation, text)
 64        return result
 65
 66    def escape_between_parens(text: str) -> str:
 67        result = re.sub(BETWEEN_PARENS_JA_REGEX, escape_between_punctuation, text)
 68        return result
 69
 70    def sub_split_punctuation(text: str) -> List[str]:
 71        splitRegex = rf"(?<!{ESCAPE_CHAR})([{punctuations}])(?!{ESCAPE_CHAR})"
 72        result = re.sub(splitRegex, "\\1\n", text)
 73        unescapeRegex = rf"({ESCAPE_CHAR})([{punctuations}])({ESCAPE_CHAR})"
 74        result = re.sub(unescapeRegex, "\\2", result)
 75        return result.splitlines()
 76
 77    for text in texts:
 78        temp = text
 79        if not split_between_quote:
 80            temp = escape_between_quote(temp)
 81        if not split_between_parens:
 82            temp = escape_between_parens(temp)
 83        sentences = sub_split_punctuation(temp)
 84        for sentence in sentences:
 85            yield sentence
 86
 87
 88@overload
 89def split_punctuation(
 90    arg: str, punctuations: str = DEFAULT_PUNCTUATION_REGEX, split_between_quote: bool = False, split_between_parens: bool = False
 91) -> Generator[str, None, None]:
 92    ...
 93
 94
 95@overload
 96def split_punctuation(
 97    arg: List[str], punctuations: str = DEFAULT_PUNCTUATION_REGEX, split_between_quote: bool = False, split_between_parens: bool = False
 98) -> Generator[str, None, None]:
 99    ...
100
101
102@overload
103def split_punctuation(
104    arg: Iterator[str], punctuations: str = DEFAULT_PUNCTUATION_REGEX, split_between_quote: bool = False, split_between_parens: bool = False
105) -> Generator[str, None, None]:
106    ...
107
108
109def split_punctuation(
110    arg: Union[str, List[str], Iterator[str]],
111    punctuations: str = DEFAULT_PUNCTUATION_REGEX,
112    split_between_quote: bool = False,
113    split_between_parens: bool = False,
114) -> Generator[str, None, None]:
115    """Split text with puctuations.
116
117    Parameters
118    ----------
119    arg : Union[str, List[str], Iterator[str]]
120        texts you want to split
121    punctuations : str, optional
122        regular expression for puctuations, by default DEFAULT_PUNCTUATION_REGEX
123    split_between_quote : bool, optional
124        split if punctuation between quotes, by default False
125    split_between_parens : bool, optional
126        split if punctuation between parentheses, by default False
127
128    Yields
129    ------
130    Generator[str, None, None]
131        texts splitted with puctuations.
132    """
133    if isinstance(arg, str):
134        yield from __split_punctuation_iter(iter([arg]), punctuations, split_between_quote, split_between_parens)
135    elif isinstance(arg, list):
136        yield from __split_punctuation_iter(iter(arg), punctuations, split_between_quote, split_between_parens)
137    elif isinstance(arg, Iterator):
138        yield from __split_punctuation_iter(arg, punctuations, split_between_quote, split_between_parens)
DEFAULT_PUNCTUATION_REGEX = '。!?'

default punctuation characters for splitting.

def split_newline( arg: Union[str, List[str], Iterator[str]]) -> Generator[str, NoneType, NoneType]:
def split_newline(arg: Union[str, List[str], Iterator[str]]) -> Generator[str, None, None]:
    """Split text at line boundaries.

    Parameters
    ----------
    arg : Union[str, List[str], Iterator[str]]
        A single text, or any iterable of texts, to split.

    Yields
    ------
    str
        Each line of each text, in order, without its line terminator.

    Raises
    ------
    TypeError
        If ``arg`` is neither a ``str`` nor an iterable. The error surfaces
        when the generator is first advanced.
    """
    if isinstance(arg, str):
        # A lone string is one text; iterating it directly would walk its characters.
        texts: Iterator[str] = iter([arg])
    else:
        # Accept any iterable (list, tuple, generator, ...) instead of silently
        # yielding nothing for the ones that are not a list or an iterator.
        try:
            texts = iter(arg)
        except TypeError:
            raise TypeError(f"arg must be a str or an iterable of str, not {type(arg).__name__}") from None
    yield from __split_newline_iter(texts)

Split text with line boundaries.

Parameters
  • arg (Union[str, List[str], Iterator[str]]): texts you want to split.
Yields
  • Generator[str, None, None]: texts split at line boundaries.
def split_punctuation( arg: Union[str, List[str], Iterator[str]], punctuations: str = '。!?', split_between_quote: bool = False, split_between_parens: bool = False) -> Generator[str, NoneType, NoneType]:
def split_punctuation(
    arg: Union[str, List[str], Iterator[str]],
    punctuations: str = DEFAULT_PUNCTUATION_REGEX,
    split_between_quote: bool = False,
    split_between_parens: bool = False,
) -> Generator[str, None, None]:
    """Split text at punctuation characters.

    Parameters
    ----------
    arg : Union[str, List[str], Iterator[str]]
        A single text, or any iterable of texts, to split.
    punctuations : str, optional
        regular expression for punctuations, by default DEFAULT_PUNCTUATION_REGEX
    split_between_quote : bool, optional
        split if punctuation between quotes, by default False
    split_between_parens : bool, optional
        split if punctuation between parentheses, by default False

    Yields
    ------
    str
        texts split at punctuation characters.

    Raises
    ------
    TypeError
        If ``arg`` is neither a ``str`` nor an iterable. The error surfaces
        when the generator is first advanced.
    """
    if isinstance(arg, str):
        # A lone string is one text; iterating it directly would walk its characters.
        texts: Iterator[str] = iter([arg])
    else:
        # Accept any iterable (list, tuple, generator, ...) instead of silently
        # yielding nothing for the ones that are not a list or an iterator.
        try:
            texts = iter(arg)
        except TypeError:
            raise TypeError(f"arg must be a str or an iterable of str, not {type(arg).__name__}") from None
    yield from __split_punctuation_iter(texts, punctuations, split_between_quote, split_between_parens)

Split text at punctuation characters.

Parameters
  • arg (Union[str, List[str], Iterator[str]]): texts you want to split
  • punctuations (str, optional): regular expression for punctuations, by default DEFAULT_PUNCTUATION_REGEX
  • split_between_quote (bool, optional): split if punctuation between quotes, by default False
  • split_between_parens (bool, optional): split if punctuation between parentheses, by default False
Yields
  • Generator[str, None, None]: texts split at punctuation characters.