Skip to content

Commit 6f796fb

Browse files
authored
Merge pull request #18560 from Yoast/LR-10-de-sentence-detection-is-incorrect-in-sentences-containing-ordinal-numbers
LR-10 Improves sentence recognition for German with ordinal numbers
2 parents ed4e472 + c7e7313 commit 6f796fb

8 files changed

Lines changed: 129 additions & 7 deletions

File tree

packages/yoastseo/spec/fullTextTests/testTexts/de/germanPaper.js

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -113,17 +113,17 @@ const expectedResults = {
113113
textSentenceLength: {
114114
isApplicable: true,
115115
score: 6,
116-
resultText: "<a href='https://yoa.st/34v' target='_blank'>Sentence length</a>: 26.5% of the sentences contain more than 20 words, which is more than the recommended maximum of 25%. <a href='https://yoa.st/34w' target='_blank'>Try to shorten the sentences</a>.",
116+
resultText: "<a href='https://yoa.st/34v' target='_blank'>Sentence length</a>: 29.5% of the sentences contain more than 20 words, which is more than the recommended maximum of 25%. <a href='https://yoa.st/34w' target='_blank'>Try to shorten the sentences</a>.",
117117
},
118118
textTransitionWords: {
119119
isApplicable: true,
120120
score: 6,
121-
resultText: "<a href='https://yoa.st/34z' target='_blank'>Transition words</a>: Only 26.4% of the sentences contain transition words, which is not enough. <a href='https://yoa.st/35a' target='_blank'>Use more of them</a>.",
121+
resultText: "<a href='https://yoa.st/34z' target='_blank'>Transition words</a>: Only 27.1% of the sentences contain transition words, which is not enough. <a href='https://yoa.st/35a' target='_blank'>Use more of them</a>.",
122122
},
123123
passiveVoice: {
124124
isApplicable: true,
125125
score: 3,
126-
resultText: "<a href='https://yoa.st/34t' target='_blank'>Passive voice</a>: 15.1% of the sentences contain passive voice, which is more than the recommended maximum of 10%. <a href='https://yoa.st/34u' target='_blank'>Try to use their active counterparts</a>.",
126+
resultText: "<a href='https://yoa.st/34t' target='_blank'>Passive voice</a>: 16.7% of the sentences contain passive voice, which is more than the recommended maximum of 10%. <a href='https://yoa.st/34u' target='_blank'>Try to use their active counterparts</a>.",
127127
},
128128
textPresence: {
129129
isApplicable: true,

packages/yoastseo/spec/languageProcessing/helpers/sentence/SentenceTokenizerSpec.js

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -354,5 +354,9 @@ describe( "A test for tokenizing a (html) text into sentences", function() {
354354

355355
expect( mockTokenizer.isPartOfPersonInitial( token, previousToken, nextToken, secondToNextToken ) ).toBeTruthy();
356356
} );
357+
358+
it( "endsWithOrdinalDot should return false when the German tokenizer is not used", () => {
359+
expect( mockTokenizer.endsWithOrdinalDot( "Anything you want to put here, it shouldn't matter." ) ).toBe( false );
360+
} );
357361
} );
358362

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
import GermanSentenceTokenizer from "../../../../../../src/languageProcessing/languages/de/helpers/internal/SentenceTokenizer";
2+
3+
4+
const sentenceTokenizer = new GermanSentenceTokenizer();
5+
6+
describe( "Test German extension to sentence tokenizer", () =>{
7+
it( "Correctly tokenizes a sentence with a German ordinal.", () =>{
8+
const tokens = [
9+
{ type: "sentence", src: "In den 66" },
10+
{ type: "full-stop", src: "." },
11+
{ type: "sentence", src: " Club der Stadt wird nachts getanzt" },
12+
{ type: "full-stop", src: "." },
13+
14+
];
15+
expect( sentenceTokenizer.getSentencesFromTokens( tokens )[ 0 ] ).toBe( "In den 66. Club der Stadt wird nachts getanzt." );
16+
} );
17+
18+
it( "Recognizes when a full-stop is part of a German ordinal with 1 digits.", () =>{
19+
const currentSentence = "In den 1.";
20+
expect( sentenceTokenizer.endsWithOrdinalDot( currentSentence ) ).toBe( true );
21+
} );
22+
23+
it( "Recognizes when a full-stop is part of a German ordinal with 2 digits.", () =>{
24+
const currentSentence = "In den 12.";
25+
expect( sentenceTokenizer.endsWithOrdinalDot( currentSentence ) ).toBe( true );
26+
} );
27+
28+
it( "Recognizes when a full-stop is part of a German ordinal with 3 digits.", () =>{
29+
const currentSentence = "In den 123.";
30+
expect( sentenceTokenizer.endsWithOrdinalDot( currentSentence ) ).toBe( true );
31+
} );
32+
33+
xit( "Does not recognize when a full-stop is part of a German ordinal with 4 digits (or more). This is by design.", () =>{
34+
const currentSentence = "In den 1234.";
35+
expect( sentenceTokenizer.endsWithOrdinalDot( currentSentence ) ).toBe( true );
36+
} );
37+
38+
it( "Recognizes when a full-stop is NOT part of a German ordinal.", () =>{
39+
const currentSentence = "In den 12. Club der Stadt wird nachts getanzt.";
40+
expect( sentenceTokenizer.endsWithOrdinalDot( currentSentence ) ).toBe( false );
41+
} );
42+
} );

packages/yoastseo/src/languageProcessing/helpers/sentence/SentenceTokenizer.js

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ const blockEndRegex = /^\s*[\])}]\s*$/;
2525
const abbreviationsPreparedForRegex = abbreviations.map( ( abbreviation ) => abbreviation.replace( ".", "\\." ) );
2626
const abbreviationsRegex = createRegexFromArray( abbreviationsPreparedForRegex );
2727

28-
const wordBoundariesForRegex = "[" + wordBoundaries().map( ( boundary ) => "\\" + boundary ).join( "" ) + "]";
28+
const wordBoundariesForRegex = "(^|[" + wordBoundaries().map( ( boundary ) => "\\" + boundary ).join( "" ) + "])";
2929
const lastCharacterPartOfInitialsRegex = new RegExp( wordBoundariesForRegex + "[A-Za-z]$" );
3030

3131
/**
@@ -87,6 +87,15 @@ export default class SentenceTokenizer {
8787
"\"" === character;
8888
}
8989

90+
/**
91+
* Default ordinal-dot check: the base tokenizer never treats a full stop as an ordinal dot, and ignores any argument passed.
92+
* Tokenizers for languages that use an ordinal dot (e.g. the German tokenizer) override this method with a real check.
93+
* @returns {boolean} Always false in this base implementation.
94+
*/
95+
endsWithOrdinalDot() {
96+
return false;
97+
}
98+
9099
/**
91100
* Returns whether or not a given character is a punctuation mark that can be at the beginning
92101
* of a sentence, like ¿ and ¡ used in Spanish.
@@ -495,6 +504,14 @@ export default class SentenceTokenizer {
495504
if ( this.isPartOfPersonInitial( token, previousToken, nextToken, secondToNextToken ) ) {
496505
break;
497506
}
507+
508+
// If the full stop is an ordinal dot (as used in German), then don't split the sentence here.
509+
// This check must be done after the hasNextSentence && this.isNumber( nextCharacters[ 0 ] ) check (above),
510+
// because otherwise it could break before that check is reached.
511+
if ( this.endsWithOrdinalDot( currentSentence ) ) {
512+
break;
513+
}
514+
498515
/*
499516
* Only split on full stop when:
500517
* a) There is a next sentence, and the next character is a valid sentence beginning preceded by a white space, OR

packages/yoastseo/src/languageProcessing/languages/de/Researcher.js

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@ import transitionWords from "./config/transitionWords";
99
import twoPartTransitionWords from "./config/twoPartTransitionWords";
1010
import syllables from "./config/syllables.json";
1111
import keyphraseLength from "./config/keyphraseLength";
12+
import memoizedTokenizer from "./helpers/memoizedSentenceTokenizer";
13+
1214

1315
// All helpers
1416
import getClauses from "./helpers/getClauses";
@@ -43,6 +45,7 @@ export default class Researcher extends AbstractResearcher {
4345
getClauses,
4446
getStemmer,
4547
fleschReadingScore,
48+
memoizedTokenizer,
4649
} );
4750
}
4851
}
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
import SentenceTokenizer from "../../../../helpers/sentence/SentenceTokenizer";
2+
import wordBoundaries from "../../../../../config/wordBoundaries";
3+
4+
// The beginning of a string (^) or one of the word boundaries returned by wordBoundaries(), each escaped with a backslash.
5+
const wordBoundariesForRegex = "(^|[" + wordBoundaries().map( ( boundary ) => "\\" + boundary ).join( "" ) + "])";
6+
// Matches one to three digits followed by a full stop at the very end of the string,
// where the digits are directly preceded by the start of the string or by a word boundary.
const ordinalDotRegex = new RegExp( wordBoundariesForRegex + "\\d{1,3}\\.$" );
7+
8+
/**
9+
* German sentence tokenizer: extends the base SentenceTokenizer with a check for the German ordinal dot (e.g. "12.").
10+
*/
11+
export default class GermanSentenceTokenizer extends SentenceTokenizer {
12+
/**
13+
* Creates the tokenizer. Only delegates to the constructor of the base SentenceTokenizer.
14+
* @constructor
15+
*/
16+
constructor() {
17+
super();
18+
}
19+
20+
/**
21+
* Checks whether the full stop at the end of the given string is an ordinal dot instead of a sentence splitter.
22+
* See: https://en.wikipedia.org/wiki/Ordinal_indicator#Ordinal_dot
23+
* The string is trimmed first. It matches when it ends in one to three digits plus a full stop (see ordinalDotRegex).
24+
* @param {string} currentSentence A string ending with a full stop.
25+
* @returns {boolean} Returns true if the full stop is an ordinal dot, false otherwise.
26+
*/
27+
endsWithOrdinalDot( currentSentence ) {
28+
return ordinalDotRegex.test( currentSentence.trim() );
29+
}
30+
}
31+
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
import { memoize } from "lodash-es";
2+
import SentenceTokenizer from "./internal/SentenceTokenizer";
3+
4+
/**
5+
* Returns the sentences from a certain block, using the SentenceTokenizer imported from ./internal/SentenceTokenizer.
6+
*
7+
* @param {string} block The HTML inside an HTML block.
8+
* @returns {Array<string>} The list of sentences in the block, or an empty list if there is nothing to tokenize.
9+
*/
10+
function getSentenceTokenizer( block ) {
11+
const sentenceTokenizer = new SentenceTokenizer();
12+
const { tokenizer, tokens } = sentenceTokenizer.createTokenizer();
13+
sentenceTokenizer.tokenize( tokenizer, block );
14+
const paragraphTagsRegex = new RegExp( "^(<p>|</p>)$" );
15+
/*
16+
* Filter out blocks that consist of only a single paragraph tag. This step is necessary
17+
* since switching between editors might add extra paragraph tags with a new line tag at the end
18+
* that are incorrectly converted into separate blocks.
19+
*/
20+
return ( tokens.length === 0 || paragraphTagsRegex.test( block ) ) ? [] : sentenceTokenizer.getSentencesFromTokens( tokens );
21+
}
22+
23+
// Memoized so that a block is tokenized only once; lodash's memoize uses the first argument (the block string) as the cache key.
export default memoize( getSentenceTokenizer );

packages/yoastseo/src/languageProcessing/researches/getPassiveVoiceResult.js

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,9 @@ import { forEach } from "lodash-es";
1616
export const getMorphologicalPassives = function( paper, researcher ) {
1717
const isPassiveSentence = researcher.getHelper( "isPassiveSentence" );
1818
const text = paper.getText();
19-
// It's not necessary to pass the memoized tokenizer from the researcher here, since only Japanese has the language specific tokenizer.
20-
// Passive voice analysis is not supported in Japanese.
19+
// eslint-disable-next-line max-len
20+
// It's not necessary to pass the memoized tokenizer from the researcher here, since only Japanese and German have a language-specific tokenizer.
21+
// Passive voice analysis is not supported in Japanese. In German, passive voice is periphrastic.
2122
const sentences = getSentences( text )
2223
.map( function( sentence ) {
2324
return new Sentence( sentence );
@@ -53,7 +54,8 @@ export const getMorphologicalPassives = function( paper, researcher ) {
5354
export const getPeriphrasticPassives = function( paper, researcher ) {
5455
const getClauses = researcher.getHelper( "getClauses" );
5556
const text = paper.getText();
56-
const sentences = getSentences( text )
57+
const memoizedTokenizer = researcher.getHelper( "memoizedTokenizer" );
58+
const sentences = getSentences( text, memoizedTokenizer )
5759
.map( function( sentence ) {
5860
return new Sentence( sentence );
5961
} );

0 commit comments

Comments
 (0)