Text

A text that is broken down into individual Sentence objects on which analyses can be performed. This object uses caching to avoid performing expensive computations redundantly.
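
A minimal usage sketch, not taken from the library's own docs: the import path is inferred from the source location src/limes/text.py, and the analyzer and parser classes are hypothetical placeholders for concrete BaseAnalyzer and Parser implementations.

from limes.text import Text  # module path assumed from src/limes/text.py

text = Text(
    raw="Der Satz ist kurz. Dieser Satz ist etwas länger.",
    analyzer=SomeAnalyzer(),  # hypothetical BaseAnalyzer implementation
    parser=SomeParser(),      # hypothetical Parser implementation
)
for sentence in text:  # sentences are parsed lazily and cached on first access
    print(sentence)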

Source code in src/limes/text.py
class Text:
    """
    A text that is broken down into individual `Sentence` objects on which
    analyses can be performed.
    This object uses caching to avoid performing expensive computations
    redundantly.
    """

    def __init__(
        self,
        raw: str,
        analyzer: BaseAnalyzer,
        parser: Parser,
    ):
        """
        Create a `Text` object.

        Parameters
        ----------
        raw : str
            The string to be used as the basis of the text.
        analyzer : BaseAnalyzer
            The `BaseAnalyzer` used to perform barrier detection. This object
            contains the barrier analysis and complexity analysis logic.
        parser : Parser
            The `Parser` to be used for parsing relevant morphosyntactic
            information of the text.
        """
        self._raw = raw
        self._analyzer = analyzer
        self._parser = parser
        self._sentences: list[Sentence] | None = None

    @property
    def sentences(self) -> list[Sentence]:
        """
        A list of `Sentence` objects contained in the provided text.
        """
        if self._sentences is None:
            processed = self._parser(self._raw)
            self._sentences = [
                Sentence(sent, self._analyzer) for sent in processed.sents
            ]
        return self._sentences

    def __str__(self) -> str:
        """Return the raw text of the Text object."""
        return self._raw

    def __repr__(self) -> str:
        return f"Text({self._raw})"

    def __iter__(self):
        """
        Iterate over all `Sentence`s contained in the given text. The applied
        sentencization logic to split the `Text` into `Sentence` objects is
        determined by the `Parser` with which this `Text` was initialized.
        """
        yield from self.sentences

    def __getitem__(self, i: int) -> Sentence:
        """
        Return the i-th `Sentence` in the given text. The applied sentencization
        logic to split the `Text` into `Sentence` objects is determined by the
        `Parser` with which this `Text` was initialized.
        """
        return self.sentences[i]

    def __len__(self) -> int:
        """
        The length of the provided `Text` as determined by the number of
        `Sentence`s it contains.
        """
        return len(self.sentences)

    @property
    def barriers(self) -> list[Barrier]:
        """
        All barriers contained in the `Text`, as detected by the `Analyzer`
        attached to this `Text`.
        """
        barriers = []
        for sent in self:
            if sent.barriers is None:
                continue
            barriers.extend(sent.barriers)
        return barriers

    @property
    def local_complexities(self) -> list[tuple[SpanProtocol, float]]:
        """
        A list of syntactically coherent phrases that constitute the given
        text, as well as their respective calculated syntactic complexities.
        You can sum the local complexities to get a sound heuristic for the
        complexity of the complete text.
        """
        complexities: list[tuple[SpanProtocol, float]] = []
        for sent in self:
            complexities.extend(sent.local_complexities)
        return complexities

    def average_complexity(
        self,
        heuristic: ComplexityAlgorithm = ComplexityAlgorithm.AGGREGATED_LOCAL,
    ) -> float:
        """
        The average complexity of the `Text` as a whole, computed as the mean
        of the per-sentence global complexities.

        Parameters
        ----------
        heuristic : ComplexityAlgorithm
            Determines which heuristic to use to calculate the complexity.
        """
        complexities = [sent.global_complexity(heuristic) for sent in self]
        return float(np.mean(complexities))

    def find(
        self,
        word: str,
        pos: PartOfSpeechTag,
        dehyphenate: bool = False,
        consider_fallback_tags: bool = True,
    ) -> list[TokenProtocol] | None:
        """
        All instances of the given word contained in the text, if any.

        Parameters
        ----------
        word : str
            The word to search for. You can pass in surface forms, but using
            the lemma is recommended to reduce false negatives.
        pos : PartOfSpeechTag
            The part-of-speech tag of the word you are looking for.
        dehyphenate : bool, optional
            Whether to strip hyphens from tokens in the provided `Text`.
            This is recommended for languages that allow concatenating words
            with or without a hyphen (such as German), as arbitrary
            hyphenation in the underlying text may lead to false negatives.
        consider_fallback_tags : bool, optional
            Whether to consider alternative POS tags in cases where the
            language-specific `BaseAnalyzer` implementation supports it. This
            option is useful because automated part-of-speech taggers struggle
            to discern between certain tags (e.g. "NOUN" and "PROPN"), making
            tagging less reliable in those cases.

        Returns
        -------
        A list of tokens contained in the `Text` that match the provided details
        about the searched-for word. If no tokens in the `Text` match, returns
        None.
        """
        matches: list[TokenProtocol] = []
        for sentence in self:
            sent_matches = sentence.find(
                word=word,
                pos=pos,
                dehyphenate=dehyphenate,
                consider_fallback_tags=consider_fallback_tags,
            )
            if sent_matches is None:
                continue
            matches.extend(sent_matches)
        if matches:
            return matches
        return None

sentences property

A list of Sentence objects contained in the provided text.
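
The result of the first access is cached on the instance; a quick sketch, assuming a Text instance named text:

first = text.sentences   # first access runs the parser and caches the result
second = text.sentences  # later accesses return the cached list, no re-parse
assert first is second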

barriers property

All barriers contained in the Text, as detected by the Analyzer attached to this Text.
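
An illustrative loop, assuming a Text instance named text (what a Barrier object exposes is not documented on this page):

for barrier in text.barriers:
    print(barrier)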

local_complexities property

A list of syntactically coherent phrases that constitute the given text, as well as their respective calculated syntactic complexities. You can sum the local complexities to get a sound heuristic for the complexity of the complete text.
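
The summing heuristic mentioned above, as a sketch; it assumes a Text instance named text and relies on each entry being a (span, score) pair, per the property's declared return type:

text_complexity = sum(score for _, score in text.local_complexities)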

__init__(raw, analyzer, parser)

Create a Text object.

Parameters:

raw (str, required)
    The string to be used as the basis of the text.

analyzer (BaseAnalyzer, required)
    The BaseAnalyzer used to perform barrier detection. This object contains
    the barrier analysis and complexity analysis logic.

parser (Parser, required)
    The Parser to be used for parsing relevant morphosyntactic information
    of the text.
Source code in src/limes/text.py
def __init__(
    self,
    raw: str,
    analyzer: BaseAnalyzer,
    parser: Parser,
):
    """
    Create a `Text` object.

    Parameters
    ----------
    raw : str
        The string to be used as the basis of the text.
    analyzer : BaseAnalyzer
        The `BaseAnalyzer` used to perform barrier detection. This object
        contains the barrier analysis and complexity analysis logic.
    parser : Parser
        The `Parser` to be used for parsing relevant morphosyntactic
        information of the text.
    """
    self._raw = raw
    self._analyzer = analyzer
    self._parser = parser
    self._sentences: list[Sentence] | None = None

__str__()

Return the raw text of the Text object.

Source code in src/limes/text.py
def __str__(self) -> str:
    """Return the raw text of the Text object."""
    return self._raw

__iter__()

Iterate over all Sentences contained in the given text. The applied sentencization logic to split the Text into Sentence objects is determined by the Parser with which this Text was initialized.

Source code in src/limes/text.py
def __iter__(self):
    """
    Iterate over all `Sentence`s contained in the given text. The applied
    sentencization logic to split the `Text` into `Sentence` objects is
    determined by the `Parser` with which this `Text` was initialized.
    """
    yield from self.sentences

__getitem__(i)

Return the i-th Sentence in the given text. The applied sentencization logic to split the Text into Sentence objects is determined by the Parser with which this Text was initialized.

Source code in src/limes/text.py
def __getitem__(self, i: int) -> Sentence:
    """
    Return the i-th `Sentence` in the given text. The applied sentencization
    logic to split the `Text` into `Sentence` objects is determined by the
    `Parser` with which this `Text` was initialized.
    """
    return self.sentences[i]

__len__()

The length of the provided Text as determined by the number of Sentences it contains.

Source code in src/limes/text.py
def __len__(self) -> int:
    """
    The length of the provided `Text` as determined by the number of
    `Sentence`s it contains.
    """
    return len(self.sentences)

average_complexity(heuristic=ComplexityAlgorithm.AGGREGATED_LOCAL)

The average complexity of the Text as a whole, computed as the mean of the per-sentence global complexities.

Parameters:

heuristic (ComplexityAlgorithm, default: AGGREGATED_LOCAL)
    Determines which heuristic to use to calculate the complexity.
Source code in src/limes/text.py
def average_complexity(
    self,
    heuristic: ComplexityAlgorithm = ComplexityAlgorithm.AGGREGATED_LOCAL,
) -> float:
    """
    The average complexity of the `Text` as a whole, computed as the mean
    of the per-sentence global complexities.

    Parameters
    ----------
    heuristic : ComplexityAlgorithm
        Determines which heuristic to use to calculate the complexity.
    """
    complexities = [sent.global_complexity(heuristic) for sent in self]
    return float(np.mean(complexities))
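
An illustrative call, assuming a Text instance named text; ComplexityAlgorithm members other than the default AGGREGATED_LOCAL are not listed on this page:

score = text.average_complexity()  # uses ComplexityAlgorithm.AGGREGATED_LOCAL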

find(word, pos, dehyphenate=False, consider_fallback_tags=True)

All instances of the given word contained in the text, if any.

Parameters:

word (str, required)
    The word to search for. You can pass in surface forms, but using the
    lemma is recommended to reduce false negatives.

pos (PartOfSpeechTag, required)
    The part-of-speech tag of the word you are looking for.

dehyphenate (bool, default: False)
    Whether to strip hyphens from tokens in the provided Text. This is
    recommended for languages that allow concatenating words with or without
    a hyphen (such as German), as arbitrary hyphenation in the underlying
    text may lead to false negatives.

consider_fallback_tags (bool, default: True)
    Whether to consider alternative POS tags in cases where the
    language-specific BaseAnalyzer implementation supports it. This option
    is useful because automated part-of-speech taggers struggle to discern
    between certain tags (e.g. "NOUN" and "PROPN"), making tagging less
    reliable in those cases.

Returns:

list[TokenProtocol] | None
    A list of tokens contained in the Text that match the provided details
    about the searched-for word, or None if no tokens in the Text match.
Source code in src/limes/text.py
def find(
    self,
    word: str,
    pos: PartOfSpeechTag,
    dehyphenate: bool = False,
    consider_fallback_tags: bool = True,
) -> list[TokenProtocol] | None:
    """
    All instances of the given word contained in the text, if any.

    Parameters
    ----------
    word : str
        The word to search for. You can pass in surface forms, but using
        the lemma is recommended to reduce false negatives.
    pos : PartOfSpeechTag
        The part-of-speech tag of the word you are looking for.
    dehyphenate : bool, optional
        Whether to strip hyphens from tokens in the provided `Text`.
        This is recommended for languages that allow concatenating words
        with or without a hyphen (such as German), as arbitrary
        hyphenation in the underlying text may lead to false negatives.
    consider_fallback_tags : bool, optional
        Whether to consider alternative POS tags in cases where the
        language-specific `BaseAnalyzer` implementation supports it. This
        option is useful because automated part-of-speech taggers struggle
        to discern between certain tags (e.g. "NOUN" and "PROPN"), making
        tagging less reliable in those cases.

    Returns
    -------
    A list of tokens contained in the `Text` that match the provided details
    about the searched-for word. If no tokens in the `Text` match, returns
    None.
    """
    matches: list[TokenProtocol] = []
    for sentence in self:
        sent_matches = sentence.find(
            word=word,
            pos=pos,
            dehyphenate=dehyphenate,
            consider_fallback_tags=consider_fallback_tags,
        )
        if sent_matches is None:
            continue
        matches.extend(sent_matches)
    if matches:
        return matches
    return None
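
Illustrative usage, assuming a Text instance named text over German prose; PartOfSpeechTag.NOUN is an assumed member name, since the enum's members are not listed on this page:

matches = text.find(
    word="Haus",               # searching by lemma reduces false negatives
    pos=PartOfSpeechTag.NOUN,  # assumed enum member
    dehyphenate=True,          # recommended for German
)
if matches is not None:
    for token in matches:
        print(token)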