ja_sentence_segmenter.concatenate.simple_concatenator

Simple sentence concatenator for Japanese text.

 1"""Simple sentence concatenator for japanese text."""
 2import re
 3from typing import Generator, Iterator, List, Optional, Union, overload
 4
 5
 6def __concatenate_matching_iter(
 7    texts: Iterator[str], former_matching_rule: Optional[str], latter_matching_rule: Optional[str], remove_former_matched: bool, remove_latter_matched: bool
 8) -> Generator[str, None, None]:
 9    try:
10        former = next(texts)
11
12        for latter in texts:
13            former_match_obj = re.match(former_matching_rule, former) if former_matching_rule else None
14            latter_match_obj = re.match(latter_matching_rule, latter) if latter_matching_rule else None
15
16            if former_matching_rule and latter_matching_rule and former_match_obj and latter_match_obj:
17                tmp_former = former_match_obj.group("result") if remove_former_matched else former
18                tmp_latter = latter_match_obj.group("result") if remove_latter_matched else latter
19                former = tmp_former + tmp_latter
20            elif former_matching_rule and not latter_matching_rule and former_match_obj:
21                tmp_former = former_match_obj.group("result") if remove_former_matched else former
22                former = tmp_former + latter
23            elif not former_matching_rule and latter_matching_rule and latter_match_obj:
24                tmp_latter = latter_match_obj.group("result") if remove_latter_matched else latter
25                former += tmp_latter
26            else:
27                yield former
28                former = latter
29
30        yield former
31    except StopIteration:
32        pass
33
34
@overload
def concatenate_matching(
    arg: str,
    former_matching_rule: Optional[str] = None,
    latter_matching_rule: Optional[str] = None,
    remove_former_matched: bool = True,
    remove_latter_matched: bool = True,
) -> Generator[str, None, None]:
    ...


@overload
def concatenate_matching(
    arg: List[str],
    former_matching_rule: Optional[str] = None,
    latter_matching_rule: Optional[str] = None,
    remove_former_matched: bool = True,
    remove_latter_matched: bool = True,
) -> Generator[str, None, None]:
    ...


@overload
def concatenate_matching(
    arg: Iterator[str],
    former_matching_rule: Optional[str] = None,
    latter_matching_rule: Optional[str] = None,
    remove_former_matched: bool = True,
    remove_latter_matched: bool = True,
) -> Generator[str, None, None]:
    ...


def concatenate_matching(
    arg: Union[str, List[str], Iterator[str]],
    former_matching_rule: Optional[str] = None,
    latter_matching_rule: Optional[str] = None,
    remove_former_matched: bool = True,
    remove_latter_matched: bool = True,
) -> Generator[str, None, None]:
    r"""Concatenate two lines with regular expression rule.

    Parameters
    ----------
    arg : Union[str, List[str], Iterator[str]]
        texts you want to concatenate.
        A single ``str`` is treated as one text and is yielded unchanged,
        because it has no neighbouring line to be concatenated with.
    former_matching_rule : Optional[str], optional
        regular expression for former line, by default None
    latter_matching_rule : Optional[str], optional
        regular expression for latter line, by default None
    remove_former_matched : bool, optional
        whether to remove matched place of former line, by default True.
        if this is True, former_matching_rule must contain named group 'result',
        only that group remains.
        e.g. r"^(\s*[>]+\s*)(?P<result>.+)$"
    remove_latter_matched : bool, optional
        whether to remove matched place of latter line, by default True.
        if this is True, latter_matching_rule must contain named group 'result',
        only that group remains.
        e.g. r"^(\s*[>]+\s*)(?P<result>.+)$"

    Yields
    ------
    Generator[str, None, None]
        concatenated texts. Nothing is yielded when ``arg`` is not a ``str``,
        a ``list`` or an iterator.
    """
    if isinstance(arg, str):
        # The signature accepts str, but it used to match no branch and was
        # silently dropped. One text cannot be merged with anything, so it is
        # passed through as-is.
        yield arg
        return

    if isinstance(arg, list):
        texts = iter(arg)
    elif isinstance(arg, Iterator):
        texts = arg
    else:
        # Unsupported argument types keep the previous behaviour: yield nothing.
        return

    yield from __concatenate_matching_iter(texts, former_matching_rule, latter_matching_rule, remove_former_matched, remove_latter_matched)
def concatenate_matching( arg: Union[str, List[str], Iterator[str]], former_matching_rule: Optional[str] = None, latter_matching_rule: Optional[str] = None, remove_former_matched: bool = True, remove_latter_matched: bool = True) -> Generator[str, NoneType, NoneType]:
def concatenate_matching(
    arg: Union[str, List[str], Iterator[str]],
    former_matching_rule: Optional[str] = None,
    latter_matching_rule: Optional[str] = None,
    remove_former_matched: bool = True,
    remove_latter_matched: bool = True,
) -> Generator[str, None, None]:
    r"""Concatenate two lines with regular expression rule.

    Parameters
    ----------
    arg : Union[str, List[str], Iterator[str]]
        texts you want to concatenate.
        A single ``str`` is treated as one text and is yielded unchanged,
        because it has no neighbouring line to be concatenated with.
    former_matching_rule : Optional[str], optional
        regular expression for former line, by default None
    latter_matching_rule : Optional[str], optional
        regular expression for latter line, by default None
    remove_former_matched : bool, optional
        whether to remove matched place of former line, by default True.
        if this is True, former_matching_rule must contain named group 'result',
        only that group remains.
        e.g. r"^(\s*[>]+\s*)(?P<result>.+)$"
    remove_latter_matched : bool, optional
        whether to remove matched place of latter line, by default True.
        if this is True, latter_matching_rule must contain named group 'result',
        only that group remains.
        e.g. r"^(\s*[>]+\s*)(?P<result>.+)$"

    Yields
    ------
    Generator[str, None, None]
        concatenated texts. Nothing is yielded when ``arg`` is not a ``str``,
        a ``list`` or an iterator.
    """
    if isinstance(arg, str):
        # The signature accepts str, but it used to match no branch and was
        # silently dropped. One text cannot be merged with anything, so it is
        # passed through as-is.
        yield arg
        return

    if isinstance(arg, list):
        texts = iter(arg)
    elif isinstance(arg, Iterator):
        texts = arg
    else:
        # Unsupported argument types keep the previous behaviour: yield nothing.
        return

    yield from __concatenate_matching_iter(texts, former_matching_rule, latter_matching_rule, remove_former_matched, remove_latter_matched)

Concatenate two lines with regular expression rule.

Parameters
  • arg (Union[str, List[str], Iterator[str]]): texts you want to concatenate.
  • former_matching_rule (Optional[str], optional): regular expression for former line, by default None
  • latter_matching_rule (Optional[str], optional): regular expression for latter line, by default None
  • remove_former_matched (bool, optional): whether to remove the matched part of the former line, by default True. If this is True, former_matching_rule must contain a named group 'result'; only that group remains. e.g. r"^(\s*[>]+\s*)(?P<result>.+)$"
  • remove_latter_matched (bool, optional): whether to remove the matched part of the latter line, by default True. If this is True, latter_matching_rule must contain a named group 'result'; only that group remains. e.g. r"^(\s*[>]+\s*)(?P<result>.+)$"
Yields
  • Generator[str, None, None]: concatenated texts.