Sentence

A sentence that is broken down into its individual constituents and their associated metadata. This object utilizes caching to avoid performing expensive computations redundantly.
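
A minimal usage sketch (hypothetical setup: how parsed_sent and analyzer are obtained is not covered on this page; they stand in for any DocumentProtocol / BaseAnalyzer pair your Parser and analyzer configuration provides):

from limes.sentence import Sentence

# `parsed_sent` and `analyzer` are assumed inputs from your Parser and
# analyzer setup; only `Sentence` itself is documented here.
sentence = Sentence(parsed_sent, analyzer)

print(sentence)              # __str__ returns the raw sentence text
first = sentence.barriers    # computed on first access ...
second = sentence.barriers   # ... served from the cache afterwards
assert first is second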

Source code in src/limes/sentence.py
class Sentence:
    """
    A sentence that is broken down into its individual constituents and their
    associated metadata.
    This object utilizes caching to avoid performing expensive computations
    redundantly.
    """

    def __init__(self, sent: DocumentProtocol, analyzer: BaseAnalyzer):
        """
        Create a `Sentence` object.

        Parameters
        ----------
        sent : DocumentProtocol
            A sentence, as parsed by a `Parser` object.
        analyzer : BaseAnalyzer
            The analyzer to be used for the actual barrier analysis.
        """
        self._sent = sent
        self._analyzer = analyzer
        # Cache variables.
        self._barriers: list[Barrier] | None = None
        self._local_complexities: list[tuple[SpanProtocol, float]] | None = None
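        # Lookup index used by find(), keyed first by the dehyphenate flag
        # and then by (lemma, POS tag) pairs.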
        self._find_index: dict[
            bool,
            dict[
                tuple[str, str],
                list[TokenProtocol],
            ],
        ] = {}

    def __str__(self) -> str:
        """Return the raw string of the sentence."""
        return self._sent.text

    def __repr__(self) -> str:
        return f"Sentence({self._sent.text})"

    def __iter__(self) -> Iterator[TokenProtocol]:
        """
        Iterate over all `TokenProtocol`s contained in the given sentence. The
        type of returned `TokenProtocol` subclass depends on the `Parser` with
        which this sentence was created.
        """
        yield from self._sent

    def __getitem__(self, i: int) -> TokenProtocol:
        """
        Return the i-th `TokenProtocol` in the given sentence. The type of
        returned `TokenProtocol` subclass depends on the `Parser` with which
        this sentence was created.
        """
        return self._sent[i]

    @property
    def barriers(self) -> list[Barrier] | None:
        """
        All barriers contained in the sentence, as detected by the `Analyzer`
        attached to this sentence.
        """
        if self._barriers is None:
            self._barriers = self._analyzer.detect_barriers(self._sent)
        return self._barriers

    @property
    def local_complexities(self) -> list[tuple[SpanProtocol, float]]:
        """
        A list of syntactically coherent phrases that constitute the given
        sentence, as well as their respective calculated syntactic complexities.
        You can sum the local complexities to get a sound heuristic for the
        complexity of the complete sentence.
        """
        if self._local_complexities is None:
            self._local_complexities = self._analyzer.compute_local_complexities(
                self._sent
            )
        return self._local_complexities

    def global_complexity(
        self,
        heuristic: ComplexityAlgorithm = ComplexityAlgorithm.AGGREGATED_LOCAL,
    ) -> float:
        """
        The complexity of the sentence as a whole.

        Parameters
        ----------
        heuristic : ComplexityAlgorithm
            Determines which heuristic to use to calculate the complexity.
        """
        return self._analyzer.compute_global_complexity(self._sent, heuristic)

    def find(
        self,
        word: str,
        pos: PartOfSpeechTag,
        dehyphenate: bool = False,
        consider_fallback_tags: bool = True,
    ) -> list[TokenProtocol] | None:
        """
        All instances of the given word contained in the `Sentence`, if any.
        The first call builds an index in O(n) time; subsequent lookups run
        in constant time.

        Parameters
        ----------
        word : str
            The word to search for. You can pass in a surface form, but using
            the lemma is suggested to reduce false negatives.
        pos : PartOfSpeechTag
            The part-of-speech tag of the word you are looking for.
        dehyphenate : bool (optional)
            Whether to strip hyphens from tokens in the sentence before
            matching. This is recommended for languages that allow
            concatenating words with or without a hyphen (such as German),
            as arbitrary hyphenation in the underlying text may lead to
            false negatives.
        consider_fallback_tags : bool (optional)
            Whether to consider alternative POS tags in cases where the
            language-specific `BaseAnalyzer` implementation supports it. This
            is useful where automated part-of-speech taggers struggle to
            distinguish between certain tags (e.g. "NOUN" and "PROPN") and
            tagging is therefore less reliable.

        Returns
        -------
        list[TokenProtocol] | None
            A list of tokens contained in the `Sentence` that match the
            provided details about the searched-for word, or None if no
            tokens in the `Sentence` match.
        """
        # We lowercase the lemma because sentence processing lowercases lemmas
        # as well. We rely on the part-of-speech tag to avoid false positives
        # that may occur due to lost information during lower-casing.
        word = word.lower()
        if dehyphenate:
            word = word.replace("-", "")
        results: list[TokenProtocol] = []

        if dehyphenate not in self._find_index:
            index: dict[tuple[str, str], list[TokenProtocol]] = {}
            for token in self:
                token_lemma = token.lemma_.lower()
                if dehyphenate:
                    token_lemma = token_lemma.replace("-", "")
                key = (token_lemma, token.pos_)
                index.setdefault(key, []).append(token)
            self._find_index[dehyphenate] = index

        index = self._find_index[dehyphenate]
        matches = index.get((word, pos), [])
        results.extend(matches)

        if consider_fallback_tags:
            for fallback_pos in self._analyzer.get_pos_lookup_fallbacks(pos):
                fallback_matches = index.get((word, fallback_pos))
                if fallback_matches is None:
                    continue
                results.extend(fallback_matches)

        if results:
            return results
        return None

barriers property

All barriers contained in the sentence, as detected by the Analyzer attached to this sentence.
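
For example (a sketch assuming the sentence object from the introduction; the property may be None, so guard accordingly):

barriers = sentence.barriers
if barriers is not None:
    print(f"{len(barriers)} barrier(s) detected")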

local_complexities property

A list of syntactically coherent phrases that constitute the given sentence, as well as their respective calculated syntactic complexities. You can sum the local complexities to get a sound heuristic for the complexity of the complete sentence.
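
Summing the per-phrase scores, as suggested above, might look like this (a sketch assuming the sentence object from the introduction; printing a SpanProtocol is assumed to show the span's text):

scores = sentence.local_complexities
total = sum(score for _, score in scores)
for span, score in scores:
    print(f"{score:6.2f}  {span}")
print(f"aggregated complexity: {total:.2f}")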

__init__(sent, analyzer)

Create a Sentence object.

Parameters:

    sent : DocumentProtocol (required)
        A sentence, as parsed by a Parser object.
    analyzer : BaseAnalyzer (required)
        The analyzer to be used for the actual barrier analysis.

__str__()

Return the raw string of the sentence.


__iter__()

Iterate over all TokenProtocols contained in the given sentence. The type of returned TokenProtocol subclass depends on the Parser with which this sentence was created.


__getitem__(i)

Return the i-th TokenProtocol in the given sentence. The type of returned TokenProtocol subclass depends on the Parser with which this sentence was created.
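
Both __iter__ and __getitem__ delegate to the underlying parsed document, so token access works as it would on the parser's own objects. The lemma_ and pos_ attributes shown here are the ones find() relies on below:

for token in sentence:
    print(token.lemma_, token.pos_)  # parser-specific TokenProtocol objects
first_token = sentence[0]            # integer indexing also delegates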


global_complexity(heuristic=ComplexityAlgorithm.AGGREGATED_LOCAL)

The complexity of the sentence as a whole.

Parameters:

    heuristic : ComplexityAlgorithm (default: AGGREGATED_LOCAL)
        Determines which heuristic to use to calculate the complexity.
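
For example (the import location of ComplexityAlgorithm is an assumption; adjust it to wherever your installation exposes the enum):

from limes import ComplexityAlgorithm  # import path is an assumption

score = sentence.global_complexity()  # AGGREGATED_LOCAL is the default
# Passing the heuristic explicitly is equivalent:
score = sentence.global_complexity(ComplexityAlgorithm.AGGREGATED_LOCAL)

Unlike the cached properties above, the result is recomputed on every call.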

find(word, pos, dehyphenate=False, consider_fallback_tags=True)

All instances of the given word contained in the Sentence, if any. The first call builds an index in O(n) time; subsequent lookups run in constant time.

Parameters:

    word : str (required)
        The word to search for. You can pass in a surface form, but using the
        lemma is suggested to reduce false negatives.
    pos : PartOfSpeechTag (required)
        The part-of-speech tag of the word you are looking for.
    dehyphenate : bool (default: False)
        Whether to strip hyphens from tokens in the sentence before matching.
        This is recommended for languages that allow concatenating words with
        or without a hyphen (such as German), as arbitrary hyphenation in the
        underlying text may lead to false negatives.
    consider_fallback_tags : bool (default: True)
        Whether to consider alternative POS tags in cases where the
        language-specific BaseAnalyzer implementation supports it. This is
        useful where automated part-of-speech taggers struggle to distinguish
        between certain tags (e.g. "NOUN" and "PROPN") and tagging is
        therefore less reliable.

Returns:

    list[TokenProtocol] | None
        A list of tokens contained in the Sentence that match the provided
        details about the searched-for word, or None if no tokens match.
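
A lookup sketch (the PartOfSpeechTag import path and its NOUN member are assumptions based on the tags mentioned above; the word is lowercased internally, so its case does not matter):

from limes import PartOfSpeechTag  # import path and member are assumptions

matches = sentence.find("hund", PartOfSpeechTag.NOUN)
if matches is None:
    print("no matching tokens")
else:
    for token in matches:
        print(token.lemma_, token.pos_)

# With dehyphenate=True, hyphen variants ("E-Mail" vs. "Email") share one
# index key; a separate index is built and cached for this mode.
matches = sentence.find("e-mail", PartOfSpeechTag.NOUN, dehyphenate=True)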