Sentence

A sentence that is broken down into its individual constituents and their associated metadata. This object utilizes caching to avoid performing expensive computations redundantly.
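
A minimal usage sketch (hypothetical setup: how parsed_sent and analyzer are obtained is not covered on this page; they stand in for any DocumentProtocol / BaseAnalyzer pair your Parser and analyzer configuration provides):

from limes.sentence import Sentence

# `parsed_sent` and `analyzer` are assumed inputs from your Parser and
# analyzer setup; only `Sentence` itself is documented here.
sentence = Sentence(parsed_sent, analyzer)

print(sentence)              # __str__ returns the raw sentence text
first = sentence.barriers    # computed on first access ...
second = sentence.barriers   # ... served from the cache afterwards
assert first is second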

Source code in src/limes/sentence.py
class Sentence:
    """
    A sentence that is broken down into its individual constituents and their
    associated metadata.
    This object utilizes caching to avoid performing expensive computations
    redundantly.
    """

    def __init__(self, sent: DocumentProtocol, analyzer: BaseAnalyzer):
        """
        Create a `Sentence` object.

        Parameters
        ----------
        sent : DocumentProtocol
            A sentence, as parsed by a `Parser` object.
        analyzer : BaseAnalyzer
            The analyzer to be used for the actual barrier analysis.
        """
        self._sent = sent
        self._analyzer = analyzer
        # Cache variables.
        self._barriers: list[Barrier] | None = None
        self._local_complexities: list[tuple[SpanProtocol, float]] | None = None
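        # Lookup index used by find(), keyed first by the dehyphenate flag
        # and then by (lemma, POS tag) pairs.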
        self._find_index: dict[
            bool,
            dict[
                tuple[str, str],
                list[TokenProtocol],
            ],
        ] = {}

    def __str__(self) -> str:
        """Return the raw string of the sentence."""
        return self._sent.text

    def __repr__(self) -> str:
        return f"Sentence({self._sent.text})"

    def __iter__(self) -> Iterator[TokenProtocol]:
        """
        Iterate over all `TokenProtocol`s contained in the given sentence. The
        type of returned `TokenProtocol` subclass depends on the `Parser` with
        which this sentence was created.
        """
        yield from self._sent

    def __getitem__(self, i: int) -> TokenProtocol:
        """
        Return the i-th `TokenProtocol` in the given sentence. The type of
        returned `TokenProtocol` subclass depends on the `Parser` with which
        this sentence was created.
        """
        return self._sent[i]

    @property
    def barriers(self) -> list[Barrier] | None:
        """
        All barriers contained in the sentence, as detected by the `Analyzer`
        attached to this sentence.
        """
        if self._barriers is None:
            self._barriers = self._analyzer.detect_barriers(self._sent)
        return self._barriers

    @property
    def local_complexities(self) -> list[tuple[SpanProtocol, float]]:
        """
        A list of syntactically coherent phrases that constitute the given
        sentence, as well as their respective calculated syntactic complexities.
        You can sum the local complexities to get a sound heuristic for the
        complexity of the complete sentence.
        """
        if self._local_complexities is None:
            self._local_complexities = self._analyzer.compute_local_complexities(
                self._sent
            )
        return self._local_complexities

    def global_complexity(
        self,
        heuristic: ComplexityAlgorithm = ComplexityAlgorithm.AGGREGATED_LOCAL,
    ) -> float:
        """
        The complexity of the sentence as a whole.

        Parameters
        ----------
        heuristic : ComplexityAlgorithm
            Determines which heuristic to use to calculate the complexity.
        """
        return self._analyzer.compute_global_complexity(self._sent, heuristic)

    def find(
        self,
        word: str,
        pos: PartOfSpeechTag,
        dehyphenate: bool = False,
        consider_fallback_tags: bool = True,
    ) -> list[TokenProtocol] | None:
        """
        All instances of the given word contained in the `Sentence`, if any.
        The first call builds an index in O(n) time; subsequent lookups run
        in constant time.

        Parameters
        ----------
        word : str
            The word to search for. You can pass in a surface form, but using
            the lemma is suggested to reduce false negatives.
        pos : PartOfSpeechTag
            The part-of-speech tag of the word you are looking for.
        dehyphenate : bool (optional)
            Whether to strip hyphens from tokens in the sentence before
            matching. This is recommended for languages that allow
            concatenating words with or without a hyphen (such as German),
            as arbitrary hyphenation in the underlying text may lead to
            false negatives.
        consider_fallback_tags : bool (optional)
            Whether to consider alternative POS tags in cases where the
            language-specific `BaseAnalyzer` implementation supports it. This
            is useful where automated part-of-speech taggers struggle to
            distinguish between certain tags (e.g. "NOUN" and "PROPN") and
            tagging is therefore less reliable.

        Returns
        -------
        list[TokenProtocol] | None
            A list of tokens contained in the `Sentence` that match the
            provided details about the searched-for word, or None if no
            tokens in the `Sentence` match.
        """
        # We lowercase the lemma because sentence processing lowercases lemmas
        # as well. We rely on the part-of-speech tag to avoid false positives
        # that may occur due to lost information during lower-casing.
        word = word.lower()
        if dehyphenate:
            word = word.replace("-", "")
        results: list[TokenProtocol] = []

        if dehyphenate not in self._find_index:
            index: dict[tuple[str, str], list[TokenProtocol]] = {}
            for token in self:
                token_lemma = token.lemma_.lower()
                if dehyphenate:
                    token_lemma = token_lemma.replace("-", "")
                key = (token_lemma, token.pos_)
                index.setdefault(key, []).append(token)
            self._find_index[dehyphenate] = index

        index = self._find_index[dehyphenate]
        matches = index.get((word, pos), [])
        results.extend(matches)

        if consider_fallback_tags:
            for fallback_pos in self._analyzer.get_pos_lookup_fallbacks(pos):
                fallback_matches = index.get((word, fallback_pos))
                if fallback_matches is None:
                    continue
                results.extend(fallback_matches)

        if results:
            return results
        return None

barriers property

All barriers contained in the sentence, as detected by the Analyzer attached to this sentence.
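
For example (a sketch assuming the sentence object from the introduction; the property may be None, so guard accordingly):

barriers = sentence.barriers
if barriers is not None:
    print(f"{len(barriers)} barrier(s) detected")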

local_complexities property

A list of syntactically coherent phrases that constitute the given sentence, as well as their respective calculated syntactic complexities. You can sum the local complexities to get a sound heuristic for the complexity of the complete sentence.
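
Summing the per-phrase scores, as suggested above, might look like this (a sketch assuming the sentence object from the introduction; printing a SpanProtocol is assumed to show the span's text):

scores = sentence.local_complexities
total = sum(score for _, score in scores)
for span, score in scores:
    print(f"{score:6.2f}  {span}")
print(f"aggregated complexity: {total:.2f}")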

__init__(sent, analyzer)

Create a Sentence object.

Parameters:

    sent : DocumentProtocol (required)
        A sentence, as parsed by a Parser object.
    analyzer : BaseAnalyzer (required)
        The analyzer to be used for the actual barrier analysis.

__str__()

Return the raw string of the sentence.


__iter__()

Iterate over all TokenProtocols contained in the given sentence. The type of returned TokenProtocol subclass depends on the Parser with which this sentence was created.


__getitem__(i)

Return the i-th TokenProtocol in the given sentence. The type of returned TokenProtocol subclass depends on the Parser with which this sentence was created.
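
Both __iter__ and __getitem__ delegate to the underlying parsed document, so token access works as it would on the parser's own objects. The lemma_ and pos_ attributes shown here are the ones find() relies on below:

for token in sentence:
    print(token.lemma_, token.pos_)  # parser-specific TokenProtocol objects
first_token = sentence[0]            # integer indexing also delegates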


global_complexity(heuristic=ComplexityAlgorithm.AGGREGATED_LOCAL)

The complexity of the sentence as a whole.

Parameters:

    heuristic : ComplexityAlgorithm (default: AGGREGATED_LOCAL)
        Determines which heuristic to use to calculate the complexity.
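
For example (the import location of ComplexityAlgorithm is an assumption; adjust it to wherever your installation exposes the enum):

from limes import ComplexityAlgorithm  # import path is an assumption

score = sentence.global_complexity()  # AGGREGATED_LOCAL is the default
# Passing the heuristic explicitly is equivalent:
score = sentence.global_complexity(ComplexityAlgorithm.AGGREGATED_LOCAL)

Unlike the cached properties above, the result is recomputed on every call.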

find(word, pos, dehyphenate=False, consider_fallback_tags=True)

All instances of the given word contained in the Sentence, if any. The first call builds an index in O(n) time; subsequent lookups run in constant time.

Parameters:

    word : str (required)
        The word to search for. You can pass in a surface form, but using the
        lemma is suggested to reduce false negatives.
    pos : PartOfSpeechTag (required)
        The part-of-speech tag of the word you are looking for.
    dehyphenate : bool (default: False)
        Whether to strip hyphens from tokens in the sentence before matching.
        This is recommended for languages that allow concatenating words with
        or without a hyphen (such as German), as arbitrary hyphenation in the
        underlying text may lead to false negatives.
    consider_fallback_tags : bool (default: True)
        Whether to consider alternative POS tags in cases where the
        language-specific BaseAnalyzer implementation supports it. This is
        useful where automated part-of-speech taggers struggle to distinguish
        between certain tags (e.g. "NOUN" and "PROPN") and tagging is
        therefore less reliable.

Returns:

    list[TokenProtocol] | None
        A list of tokens contained in the Sentence that match the provided
        details about the searched-for word, or None if no tokens match.
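
A lookup sketch (the PartOfSpeechTag import path and its NOUN member are assumptions based on the tags mentioned above; the word is lowercased internally, so its case does not matter):

from limes import PartOfSpeechTag  # import path and member are assumptions

matches = sentence.find("hund", PartOfSpeechTag.NOUN)
if matches is None:
    print("no matching tokens")
else:
    for token in matches:
        print(token.lemma_, token.pos_)

# With dehyphenate=True, hyphen variants ("E-Mail" vs. "Email") share one
# index key; a separate index is built and cached for this mode.
matches = sentence.find("e-mail", PartOfSpeechTag.NOUN, dehyphenate=True)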