German Barrier Analyzer

GermanBarrierAnalyzer

Bases: BarrierAnalyzer

A BarrierAnalyzer that is specific to the German language.

Note that instances of this class are callable: the BarrierAnalyzer superclass provides the logic for automatically identifying and sequentially executing all barrier detection methods.
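
A minimal usage sketch follows. The load_lexicon helper and the nlp pipeline are hypothetical placeholders: any Lexicon instance and any parser producing DocumentProtocol-compatible sentences will do.

from limes.analyzers.de.barrier_analyzer import GermanBarrierAnalyzer

# Hypothetical setup, not part of this module's API.
lexicon = load_lexicon("de")
analyzer = GermanBarrierAnalyzer(lexicon=lexicon)
sentence = nlp("Der Antrag wird von der Behörde geprüft.")

# Instances are callable: the superclass runs every barrier detection
# method in sequence and collects the resulting Barrier objects.
barriers = analyzer(sentence)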

Source code in src/limes/analyzers/de/barrier_analyzer.py
class GermanBarrierAnalyzer(BarrierAnalyzer):
    """
    A `BarrierAnalyzer` that is specific to the German language.

    Note that instances of this class are callable: the `BarrierAnalyzer`
    superclass provides logic for automatically identifying and sequentially
    executing all barrier detection methods.
    """

    def __init__(self, lexicon: Lexicon, simplify_explanations: bool = True):
        """
        Parameters
        ----------
        lexicon : Lexicon
            Lexicon used for word-frequency lookups (e.g. when checking
            whether a negation compound is lexicalized).
        simplify_explanations : bool, optional
            If True, use simplified barrier descriptions; otherwise use
            standard descriptions. Defaults to True.
        """
        super().__init__(simplify_explanations=simplify_explanations)
        self._lexicon = lexicon

    @property
    def supported_barriers(self) -> list[Barrier]:
        """
        A list of all types of barriers that this analyzer can detect.
        """
        return [
            self._make_barrier(barrier.value, affected_tokens=None)
            for barrier in GermanBarrier
        ]

    def detect_foreign_phrases(
        self,
        sentence: DocumentProtocol,
    ) -> list[Barrier]:
        """
        Identify foreign words and phrases that might be unnecessarily
        difficult to understand.
        """
        foreign_words_instances = []
        for token in sentence:
            lemma = token.lemma_
            for phrase_length, phrases in FOREIGN_WORDS.items():
                if phrase_length == "1":
                    if lemma in phrases:
                        foreign_words_instances.append(
                            self._make_barrier(
                                GermanBarrier.FOREIGN_PHRASE.value,
                                affected_tokens=[token],
                            )
                        )
                    continue
                for phrase in phrases:
                    assert isinstance(phrase, list)
                    current_token = lemma
                    matching_tokens = []
                    for i, component in enumerate(phrase):
                        if component != current_token:
                            break
                        matching_tokens.append(sentence[token.i + i])
                        try:
                            current_token = sentence[token.i + i + 1].lemma_
                        except IndexError:
                            # The sentence ended; no further components can
                            # match this phrase.
                            break
                    if len(matching_tokens) == len(phrase):
                        foreign_words_instances.append(
                            self._make_barrier(
                                GermanBarrier.FOREIGN_PHRASE.value,
                                affected_tokens=matching_tokens,
                            )
                        )
        return foreign_words_instances

    def detect_educational_language_words(
        self,
        sentence: DocumentProtocol,
    ) -> list[Barrier]:
        """
        Identify verbs, adjectives, and adverbs that are needlessly complicated.
        """
        educational_language_words = []
        for token in sentence:
            if token.pos_ not in ["VERB", "ADJ", "ADV"]:
                continue
            lemma = token.lemma_
            for disallowed_words in EDUCATIONAL_LANGUAGE.values():
                if lemma in disallowed_words:
                    educational_language_words.append(
                        self._make_barrier(
                            GermanBarrier.EDUCATIONAL_LANGUAGE.value,
                            affected_tokens=[token],
                        )
                    )
        return educational_language_words

    def _find_regex_matches(
        self,
        regex_list: list[Pattern],
        sentence: DocumentProtocol,
        compare_against_lemma: bool = True,
        valid_pos: list[str] | None = None,
        valid_deps: list[str] | None = None,
    ) -> list[TokenProtocol]:
        """
        Identify all tokens in the provided sentence for which a regular
        expression in the provided list creates a full match.
        """
        matches = []
        for token in sentence:
            if valid_pos and token.pos_ not in valid_pos:
                continue
            if valid_deps and token.dep_ not in valid_deps:
                continue
            if compare_against_lemma:
                text = token.lemma_
            else:
                text = token.text
            for regex in regex_list:
                if regex.fullmatch(text):
                    matches.append(token)
                    break
        return matches

    def detect_compound_adjective(
        self,
        sentence: DocumentProtocol,
    ) -> list[Barrier]:
        """
        Identify adjectives and adverbs that carry meaning injected via a suffix.
        """
        matches = self._find_regex_matches(
            ADJECTIVE_SUFFIXES, sentence, valid_pos=["ADV", "ADJ"]
        )
        barriers = [
            self._make_barrier(
                GermanBarrier.COMPOUND_ADJECTIVE.value,
                affected_tokens=[token],
            )
            for token in matches
        ]
        return barriers

    def detect_attribute_nouns(
        self,
        sentence: DocumentProtocol,
    ) -> list[Barrier]:
        """
        Identify nouns that encode attributes or processes. This is an
        indication of nominalization (Substantivierung).
        """
        attribute_nouns = []
        for token in sentence:
            if token.pos_ != "NOUN":
                continue
            lemma = token.lemma_
            for suffix in NOUN_PROPERTY_SUFFIXES:
                if suffix.fullmatch(lemma):
                    attribute_nouns.append(
                        self._make_barrier(
                            GermanBarrier.ATTRIBUTE_NOUN.value,
                            affected_tokens=[token],
                        )
                    )
                    break
        return attribute_nouns

    def detect_collocational_verb_construct(
        self,
        sentence: DocumentProtocol,
    ) -> list[Barrier]:
        """
        Identify multi-word units of meaning that could be condensed into a single verb.
        """
        relevant_pos_tags = {"VERB", "ADP", "NOUN"}
        verb_constructs = []
        for token in sentence:
            if token.dep_ != "cvc":
                continue
            construct_components = [token]
            for ancestor in token.ancestors:
                if ancestor.pos_ in relevant_pos_tags:
                    construct_components.append(ancestor)
                    break
            for child in token.children:
                if child.pos_ in relevant_pos_tags:
                    construct_components.append(child)
                    break
            verb_constructs.append(
                self._make_barrier(
                    GermanBarrier.COLLOCATIONAL_VERB_CONSTRUCT.value,
                    affected_tokens=construct_components,
                )
            )
        return verb_constructs

    def detect_substitute_expression_for_modality(
        self,
        sentence: DocumentProtocol,
    ) -> list[Barrier]:
        """
        Identify phrases that use needlessly complex vocabulary instead of
        simple modal verbs.
        """
        potential_matches = []
        for token in sentence:
            match = None
            if token.pos_ != "VERB":
                continue
            if token.morph.get("VerbForm") != "Inf":
                match = self._detect_modal_phrase_kommen_in_betracht(token)
                if match:
                    potential_matches.append(match)
                    continue
            for detector in [
                self._detect_modal_phrase_in_der_lage,
                self._detect_modal_phrase_erlauben,
            ]:
                match = detector(token, sentence)
                if match:
                    potential_matches.append(match)
                    break
            if match:
                continue
            for detector in [
                self._detect_modal_phrase_lassen,
                self._detect_modal_phrase_sein_haben_zu,
            ]:
                match = detector(token)
                if match:
                    potential_matches.append(match)
                    break

        modal_phrases = []
        for match in potential_matches:
            match.sort()
            modal_phrases.append(
                self._make_barrier(
                    GermanBarrier.MODAL_PHRASE.value,
                    affected_tokens=[sentence[i] for i in match],
                )
            )
        return modal_phrases

    def _detect_modal_phrase_sein_haben_zu(
        self,
        token: TokenProtocol,
    ) -> list[int] | None:
        """
        Identify sequences that match a "sein + zu + Infinitiv" pattern.
        """
        particle_idx = None
        if "zu" not in str(token):
            for child in token.children:
                if child.lemma_ == "zu":
                    particle_idx = child.i
                    break
            if particle_idx is None:
                return None
        for ancestor in token.ancestors:
            if ancestor.lemma_ in ["sein", "haben"]:
                match = [ancestor.i]
                # Explicit None check: index 0 is a valid particle position.
                if particle_idx is not None:
                    match.append(particle_idx)
                match.append(token.i)
                return match
        return None

    def _detect_modal_phrase_lassen(
        self,
        token: TokenProtocol,
    ) -> list[int] | None:
        """
        Identify sequences that match a "sich [Infinitiv] lassen" pattern.
        """
        match = []
        for ancestor in token.ancestors:
            if ancestor.lemma_ == "lassen":
                match.append(ancestor.i)
                break
        for child in token.children:
            if child.lemma_ == "sich":
                match.append(child.i)
                break
        if len(match) == 2:
            match.append(token.i)
            return match
        return None

    def _detect_modal_phrase_in_der_lage(
        self,
        token: TokenProtocol,
        sentence: DocumentProtocol,
    ) -> list[int] | None:
        """
        Identify sequences that match an "in der Lage [zu Infinitiv]" pattern.
        """
        match = []
        for ancestor in token.ancestors:
            if ancestor.lemma_ != "lage":
                continue
            if ancestor.i < 2:
                continue
            if sentence[ancestor.i - 1].lemma_ != "der":
                continue
            if sentence[ancestor.i - 2].lemma_ != "in":
                continue
            match += [ancestor.i - 2, ancestor.i - 1, ancestor.i]
            break
        if not match:
            return None
        if "zu" not in str(token):
            for child in token.children:
                if child.lemma_ == "zu":
                    match.append(child.i)
        match.append(token.i)
        return match

    def _detect_modal_phrase_erlauben(
        self,
        token: TokenProtocol,
        sentence: DocumentProtocol,
    ) -> list[int] | None:
        """
        Identify sequences that match an "erlauben es [zu Infinitiv]" pattern.
        """
        match = []
        particle_idx = None
        if "zu" not in str(token):
            for child in token.children:
                if child.lemma_ == "zu":
                    particle_idx = child.i
                    break
            if particle_idx is None:
                return None
        for ancestor in token.ancestors:
            if ancestor.lemma_ != "es":
                continue
            if ancestor.i < 1 or sentence[ancestor.i - 1].lemma_ != "erlauben":
                continue
            match = [ancestor.i - 1, ancestor.i]
            # Explicit None check: index 0 is a valid particle position.
            if particle_idx is not None:
                match.append(particle_idx)
            match.append(token.i)
            return match
        return None

    def _detect_modal_phrase_kommen_in_betracht(
        self,
        token: TokenProtocol,
    ) -> list[int] | None:
        """
        Identify sequences that match an "in Betracht kommen" pattern.
        """
        if token.lemma_ != "kommen":
            return None
        match = None
        for child in token.children:
            if child.lemma_ != "in":
                continue
            for grandchild in child.children:
                if grandchild.lemma_ == "betracht":
                    match = [child.i, grandchild.i, token.i]
                    break
        return match

    def detect_decomposed_verbs(
        self,
        sentence: DocumentProtocol,
    ) -> list[Barrier]:
        """
        Identify verbs that are decomposed into two tokens. An example: the verb
        'einkaufen' might be decomposed into 'kaufe ein'.

        Parameters
        ----------
        sentence : DocumentProtocol
            The sentence to be analyzed.

        Returns
        -------
        A list of Barrier objects, each representing a single decomposed verb.
        Each decomposed verb consists of two parts.
        """
        decomposed_verbs = []
        for token in sentence:
            for child in token.children:
                if child.dep_ != "svp":
                    continue
                decomposed_verbs.append(
                    self._make_barrier(
                        GermanBarrier.DECOMPOSED_VERB.value,
                        affected_tokens=[token, child],
                    )
                )
        return decomposed_verbs

    def detect_passive_voice(
        self,
        sentence: DocumentProtocol,
    ) -> list[Barrier]:
        """
        Identify word pairs that make up passive-voice phrasings.

        Parameters
        ----------
        sentence : DocumentProtocol
            The sentence to be analyzed.

        Returns
        -------
        A list of Barrier objects, each representing a single passive-voice
        structure (which always consists of two tokens).
        """
        passive_voice_instances = []
        for token in sentence:
            # Base token needs to have a "clause"-like dependency or be root of
            # the sentence.
            if (
                not (
                    str(token.dep_).endswith("c") and len(str(token.dep_)) == 2
                )
                and token.dep_ != "ROOT"
                and token.pos_ != "AUX"
            ):
                continue
            # Base token needs to be a conjugated form of "werden".
            if token.lemma_ != "werden":
                continue
            # If base conditions are met for given token, check if token has
            # participle child.
            for child in token.children:
                if child.fine_pos in _PARTICIPLE_TAGS:
                    passive_voice_instances.append(
                        self._make_barrier(
                            GermanBarrier.PASSIVE_VOICE.value,
                            affected_tokens=[token, child],
                        )
                    )
        return passive_voice_instances

    def detect_negation(
        self,
        sentence: DocumentProtocol,
    ) -> list[Barrier]:
        """
        Identify cases where words contain an affix that negates the meaning
        of the word.
        """
        barriers: list[Barrier] = []
        for token in sentence:
            # Only nouns and adjectives may appear negated by affix.
            if token.pos_ == "NOUN":
                pos = "NOUN"
            # We fold adverbs into adjectives because they are functionally
            # similar for these purposes.
            elif token.pos_ in ["ADJ", "ADV"]:
                pos = "ADJ"
            else:
                continue

            # Attempt to decompose the token.
            decomp_comp = self._decompose_negation_compound(token)
            if decomp_comp is None:
                continue

            # Rule out false positives by comparing against a list of known exceptions.
            if (
                token.lemma_
                in NEGATION_RULE_EXCEPTIONS[decomp_comp.type][
                    decomp_comp.affix
                ][pos]
            ):
                continue

            double_negative = self._detect_double_negative(
                token=token,
                sentence=sentence,
            )
            # Double negatives are barriers regardless of lexicalization.
            if double_negative:
                barriers.append(double_negative)
                continue

            # If the token is a stand-alone negative compound, only consider it
            # a barrier if it is not lexicalized.
            if not self._negative_is_lexicalized(
                negative=token.lemma_,
                positive=decomp_comp.stem,
            ):
                barriers.append(
                    self._make_barrier(
                        GermanBarrier.NEGATION_AFFIX.value,
                        affected_tokens=[token],
                    )
                )

        return barriers

    def _decompose_negation_compound(
        self,
        token: TokenProtocol,
    ) -> DecomposedNegationCompound | None:
        """
        Break up a token into affix and stem and categorize the type of affix.
        If the token does not contain any relevant affix, returns None.
        """
        lemma = token.lemma_
        for affix_type, regex_list in NEGATION_AFFIXES.items():
            for regex in regex_list:
                match = regex.fullmatch(lemma)
                if match is None:
                    continue

                stem_str = match.group("stem")
                # If token is noun, the stem of negation compound should also
                # be a noun and therefore capitalized in German.
                # Compound negatives driven by suffixes are often subject to word class
                # changes (e.g. "Zucker" -> "zuckerfrei"); that is why we are looking
                # for capitalized words in this case.
                if token.pos_ == "NOUN" or affix_type == "suffix":
                    stem_str = stem_str.capitalize()

                stem_freq = self._lexicon.get_frequency(stem_str)
                # Stems with no frequency information are not lexicalized and
                # therefore not candidates for negation compounds with the given
                # suffix.
                if stem_freq is None:
                    continue

                return DecomposedNegationCompound(
                    stem=Stem(
                        string=stem_str,
                        freq=stem_freq,
                    ),
                    affix=match.group("affix"),
                    type=affix_type,
                )
        return None

    def _negative_is_lexicalized(
        self,
        negative: str,
        positive: Stem,
        threshold: float = 0.5,
    ) -> bool:
        """
        Check whether we can assume lexicalization of the negative based on
        the size of the frequency delta. A negative that is not contained in
        the lexicon is assumed not to be lexicalized.
        """
        freq_neg = self._lexicon.get_frequency(negative)

        # If lexicon doesn't know the negative, we assume it is not lexicalized.
        if freq_neg is None:
            return False

        return freq_neg > threshold * positive.freq

    def _detect_double_negative(
        self,
        token: TokenProtocol,
        sentence: DocumentProtocol,
    ) -> Barrier | None:
        """
        Identify whether a given compound negation is preceded by a negative,
        implying a double negative. This function assumes that the given token
        is a compound negative.
        """
        # Check left neighbor for negative.
        if token.i > 0:
            neighbor = sentence[token.i - 1]
            if neighbor.lemma_ in ["nicht", "kein", "ohne"]:
                return self._make_barrier(
                    GermanBarrier.DOUBLE_NEGATION.value,
                    affected_tokens=[neighbor, token],
                )
        return None

    def detect_long_sentence(
        self,
        sentence: DocumentProtocol,
    ) -> list[Barrier]:
        """
        Identify sentences that are too long (15 or more words, excluding
        punctuation).
        """
        words = [t for t in sentence if t.pos_ != "PUNCT"]
        if len(words) >= 15:
            return [
                self._make_barrier(
                    GermanBarrier.LONG_SENTENCE.value,
                    affected_tokens=words,
                )
            ]
        return []

    def detect_numeral_words(
        self,
        sentence: DocumentProtocol,
    ) -> list[Barrier]:
        """
        Identify numeral words in the sentence, i.e. numbers spelled out in
        letters (e.g. "sechs" rather than "6").
        """
        numeral_words = []
        for token in sentence:
            if token.pos_ != "NUM":
                continue
            text = token.text
            if any(char.isdigit() for char in text):
                continue
            if not any(char.isalpha() for char in text):
                continue
            numeral_words.append(
                self._make_barrier(
                    GermanBarrier.NUMERAL_WORD.value,
                    affected_tokens=[token],
                )
            )
        return numeral_words

supported_barriers property

A list of all types of barriers that this analyzer can detect.
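
For example, continuing the hypothetical sketch from the top of this page:

# One generic Barrier per GermanBarrier member, without affected tokens.
for barrier in analyzer.supported_barriers:
    print(barrier)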

__init__(lexicon, simplify_explanations=True)

Parameters:

    lexicon (Lexicon, required): Lexicon used for word-frequency lookups
        (e.g. when checking whether a negation compound is lexicalized).
    simplify_explanations (bool, default True): If True, use simplified
        barrier descriptions; otherwise use standard descriptions.

detect_foreign_phrases(sentence)

Identify foreign words and phrases that might be unnecessarily difficult to understand.
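
A hedged example, reusing the hypothetical nlp and analyzer from the sketch at the top of this page; whether a phrase is flagged depends on the contents of the FOREIGN_WORDS table:

doc = nlp("Die Entscheidung wurde ad hoc getroffen.")
# If "ad hoc" is listed in FOREIGN_WORDS, this yields one FOREIGN_PHRASE
# barrier covering both tokens.
barriers = analyzer.detect_foreign_phrases(doc)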

detect_educational_language_words(sentence)

Identify verbs, adjectives, and adverbs that are needlessly complicated.
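
A hedged example; hits depend on the EDUCATIONAL_LANGUAGE word lists:

# "eruieren" is a typical educational-language verb ("herausfinden" is
# simpler); it is only flagged if listed in EDUCATIONAL_LANGUAGE.
doc = nlp("Wir müssen die Ursache eruieren.")
barriers = analyzer.detect_educational_language_words(doc)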

detect_compound_adjective(sentence)

Identify adjectives and adverbs that carry meaning injected via a suffix.
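
A hedged example; whether a suffix such as "-reich" matches depends on the ADJECTIVE_SUFFIXES patterns:

doc = nlp("Das Verfahren ist risikoreich.")  # "risikoreich" ~ "mit viel Risiko"
barriers = analyzer.detect_compound_adjective(doc)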

detect_attribute_nouns(sentence)

Identify nouns that encode attributes or processes. This is an indication of nominalization (Substantivierung).
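
A hedged example; nominalizations such as "Durchführung" (from the verb "durchführen") end in process-encoding suffixes, and whether one matches depends on the NOUN_PROPERTY_SUFFIXES patterns:

doc = nlp("Die Durchführung der Messung dauert lange.")
barriers = analyzer.detect_attribute_nouns(doc)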

detect_collocational_verb_construct(sentence)

Identify multi-word units of meaning that could be condensed into a single verb.
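
A hedged example; detection relies on the parser assigning the "cvc" (collocational verb construction) dependency label:

# "zur Anwendung kommen" could be condensed into the single verb "anwenden".
doc = nlp("Die neue Regel kommt hier zur Anwendung.")
barriers = analyzer.detect_collocational_verb_construct(doc)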

detect_substitute_expression_for_modality(sentence)

Identify phrases that use needlessly complex vocabulary instead of simple modal verbs.
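
A hedged example; "in der Lage sein, zu ..." can usually be replaced by the modal verb "können":

doc = nlp("Sie ist in der Lage, das Problem zu lösen.")  # ~ "Sie kann ..."
barriers = analyzer.detect_substitute_expression_for_modality(doc)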

detect_decomposed_verbs(sentence)

Identify verbs that are decomposed into two tokens. An example: the verb 'einkaufen' might be decomposed into 'kaufe ein'.

Parameters:

    sentence (DocumentProtocol, required): The sentence to be analyzed.

Returns:

    list[Barrier]: A list of Barrier objects, each representing a single
        decomposed verb. Each decomposed verb consists of two parts.
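
A hedged example; the particle of a separable verb carries the "svp" (separable verb prefix) dependency label:

# "einkaufen" is decomposed into "kaufe ... ein" in this sentence.
doc = nlp("Ich kaufe morgen im Supermarkt ein.")
barriers = analyzer.detect_decomposed_verbs(doc)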

detect_passive_voice(sentence)

Identify word pairs that make up passive-voice phrasings.

Parameters:

    sentence (DocumentProtocol, required): The sentence to be analyzed.

Returns:

    list[Barrier]: A list of Barrier objects, each representing a single
        passive-voice structure (which always consists of two tokens).
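
A hedged example; a conjugated form of "werden" governing a participle marks the passive voice:

doc = nlp("Der Antrag wird von der Behörde geprüft.")  # "wird ... geprüft"
barriers = analyzer.detect_passive_voice(doc)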

detect_negation(sentence)

Identify cases where words contain an affix that negates the meaning of the word.
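
A hedged example; a negation compound preceded by "nicht", "kein", or "ohne" is reported as a double negative, while a stand-alone compound is only reported if the lexicon's frequency data suggests it is not lexicalized:

doc = nlp("Die Lage ist nicht unproblematisch.")  # "nicht" + "un-..."
barriers = analyzer.detect_negation(doc)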

detect_long_sentence(sentence)

Identify sentences that are too long (15 or more words, excluding punctuation).
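
For example, reusing the hypothetical setup from the top of this page:

doc = nlp(
    "Der Antrag, der gestern bei der zuständigen Behörde eingereicht "
    "wurde, muss noch von mehreren Stellen geprüft und bestätigt werden."
)
barriers = analyzer.detect_long_sentence(doc)  # 19 words -> one barrier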

detect_numeral_words(sentence)

Identify numeral words, i.e. numbers spelled out in letters (e.g. "sechs" rather than "6").
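
For example:

doc = nlp("Es gibt sechs Möglichkeiten.")  # "sechs" is flagged; "6" would not be
barriers = analyzer.detect_numeral_words(doc)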
