Merge pull request #18560 from Yoast/LR-10-de-sentence-detection-is-incorrect-in-sentences-containing-ordinal-numbers

mhkuu · web-flow · commit 6f796fb980a2 · 2022-06-15T16:19:03.000+02:00
LR-10 Improves sentence recognition for German with ordinal numbers
diff --git a/packages/yoastseo/spec/fullTextTests/testTexts/de/germanPaper.js b/packages/yoastseo/spec/fullTextTests/testTexts/de/germanPaper.js
@@ -113,17 +113,17 @@ const expectedResults = {
 	textSentenceLength: {
 		isApplicable: true,
 		score: 6,
-		resultText: "<a href='https://yoa.st/34v' target='_blank'>Sentence length</a>: 26.5% of the sentences contain more than 20 words, which is more than the recommended maximum of 25%. <a href='https://yoa.st/34w' target='_blank'>Try to shorten the sentences</a>.",
+		resultText: "<a href='https://yoa.st/34v' target='_blank'>Sentence length</a>: 29.5% of the sentences contain more than 20 words, which is more than the recommended maximum of 25%. <a href='https://yoa.st/34w' target='_blank'>Try to shorten the sentences</a>.",
 	},
 	textTransitionWords: {
 		isApplicable: true,
 		score: 6,
-		resultText: "<a href='https://yoa.st/34z' target='_blank'>Transition words</a>: Only 26.4% of the sentences contain transition words, which is not enough. <a href='https://yoa.st/35a' target='_blank'>Use more of them</a>.",
+		resultText: "<a href='https://yoa.st/34z' target='_blank'>Transition words</a>: Only 27.1% of the sentences contain transition words, which is not enough. <a href='https://yoa.st/35a' target='_blank'>Use more of them</a>.",
 	},
 	passiveVoice: {
 		isApplicable: true,
 		score: 3,
-		resultText: "<a href='https://yoa.st/34t' target='_blank'>Passive voice</a>: 15.1% of the sentences contain passive voice, which is more than the recommended maximum of 10%. <a href='https://yoa.st/34u' target='_blank'>Try to use their active counterparts</a>.",
+		resultText: "<a href='https://yoa.st/34t' target='_blank'>Passive voice</a>: 16.7% of the sentences contain passive voice, which is more than the recommended maximum of 10%. <a href='https://yoa.st/34u' target='_blank'>Try to use their active counterparts</a>.",
 	},
 	textPresence: {
 		isApplicable: true,
diff --git a/packages/yoastseo/spec/languageProcessing/helpers/sentence/SentenceTokenizerSpec.js b/packages/yoastseo/spec/languageProcessing/helpers/sentence/SentenceTokenizerSpec.js
@@ -354,5 +354,9 @@ describe( "A test for tokenizing a (html) text into sentences", function() {
 
 		expect( mockTokenizer.isPartOfPersonInitial( token, previousToken, nextToken, secondToNextToken ) ).toBeTruthy();
 	} );
+
+	it( "endsWithOrdinalDot should return false when the German tokenizer is not used", () => {
+		expect( mockTokenizer.endsWithOrdinalDot( "Anything you want to put here, it shouldn't matter." ) ).toBe( false );
+	} );
 } );
 
diff --git a/packages/yoastseo/spec/languageProcessing/languages/de/helpers/internal/GermanSentenceTokenizerSpec.js b/packages/yoastseo/spec/languageProcessing/languages/de/helpers/internal/GermanSentenceTokenizerSpec.js
@@ -0,0 +1,42 @@
+import GermanSentenceTokenizer from "../../../../../../src/languageProcessing/languages/de/helpers/internal/SentenceTokenizer";
+
+
+const sentenceTokenizer = new GermanSentenceTokenizer();
+
+describe( "Test German extension to sentence tokenizer", () => {
+	it( "Correctly tokenizes a sentence with a German ordinal.", () => {
+		const tokens = [
+			{ type: "sentence", src: "In den 66" },
+			{ type: "full-stop", src: "." },
+			{ type: "sentence", src: " Club der Stadt wird nachts getanzt" },
+			{ type: "full-stop", src: "." },
+		];
+		expect( sentenceTokenizer.getSentencesFromTokens( tokens )[ 0 ] ).toBe( "In den 66. Club der Stadt wird nachts getanzt." );
+	} );
+
+	it( "Recognizes when a full-stop is part of a German ordinal with 1 digit.", () => {
+		const currentSentence = "In den 1.";
+		expect( sentenceTokenizer.endsWithOrdinalDot( currentSentence ) ).toBe( true );
+	} );
+
+	it( "Recognizes when a full-stop is part of a German ordinal with 2 digits.", () => {
+		const currentSentence = "In den 12.";
+		expect( sentenceTokenizer.endsWithOrdinalDot( currentSentence ) ).toBe( true );
+	} );
+
+	it( "Recognizes when a full-stop is part of a German ordinal with 3 digits.", () => {
+		const currentSentence = "In den 123.";
+		expect( sentenceTokenizer.endsWithOrdinalDot( currentSentence ) ).toBe( true );
+	} );
+
+	// Only 1 to 3 digits before the full stop are treated as an ordinal, so a 4-digit number must not match.
+	it( "Does not recognize when a full-stop is part of a German ordinal with 4 digits (or more). This is by design.", () => {
+		const currentSentence = "In den 1234.";
+		expect( sentenceTokenizer.endsWithOrdinalDot( currentSentence ) ).toBe( false );
+	} );
+
+	it( "Recognizes when a full-stop is NOT part of a German ordinal.", () => {
+		const currentSentence = "In den 12. Club der Stadt wird nachts getanzt.";
+		expect( sentenceTokenizer.endsWithOrdinalDot( currentSentence ) ).toBe( false );
+	} );
+} );
diff --git a/packages/yoastseo/src/languageProcessing/helpers/sentence/SentenceTokenizer.js b/packages/yoastseo/src/languageProcessing/helpers/sentence/SentenceTokenizer.js
@@ -25,7 +25,7 @@ const blockEndRegex = /^\s*[\])}]\s*$/;
 const abbreviationsPreparedForRegex = abbreviations.map( ( abbreviation ) => abbreviation.replace( ".", "\\." ) );
 const abbreviationsRegex = createRegexFromArray( abbreviationsPreparedForRegex );
 
-const wordBoundariesForRegex = "[" + wordBoundaries().map( ( boundary ) => "\\" + boundary ).join( "" ) + "]";
+const wordBoundariesForRegex = "(^|[" + wordBoundaries().map( ( boundary ) => "\\" + boundary ).join( "" ) + "])";
 const lastCharacterPartOfInitialsRegex = new RegExp( wordBoundariesForRegex + "[A-Za-z]$" );
 
 /**
@@ -87,6 +87,15 @@ export default class SentenceTokenizer {
 			"\"" === character;
 	}
 
+	/**
+	 * Default ordinal dot check for languages without one; tokenizers for languages with an ordinal dot (e.g. German) override it.
+	 *
+	 * @returns {boolean} Always false: by default, a full stop is never treated as an ordinal dot.
+	 */
+	endsWithOrdinalDot() {
+		return false;
+	}
+
 	/**
 	 * Returns whether or not a given character is a punctuation mark that can be at the beginning
 	 * of a sentence, like ¿ and ¡ used in Spanish.
@@ -495,6 +504,14 @@ export default class SentenceTokenizer {
 					if ( this.isPartOfPersonInitial( token, previousToken, nextToken, secondToNextToken ) ) {
 						break;
 					}
+
+					// If the full stop is an ordinal dot (in German), then don't break the sentence.
+					// This check must come after the `hasNextSentence && this.isNumber( nextCharacters[ 0 ] )` check (above),
+					// because otherwise it could break before that check is reached.
+					if ( this.endsWithOrdinalDot( currentSentence ) ) {
+						break;
+					}
+
 					/*
 					 * Only split on full stop when:
 					 * a) There is a next sentence, and the next character is a valid sentence beginning preceded by a white space, OR
diff --git a/packages/yoastseo/src/languageProcessing/languages/de/Researcher.js b/packages/yoastseo/src/languageProcessing/languages/de/Researcher.js
@@ -9,6 +9,8 @@ import transitionWords from "./config/transitionWords";
 import twoPartTransitionWords from "./config/twoPartTransitionWords";
 import syllables from "./config/syllables.json";
 import keyphraseLength from "./config/keyphraseLength";
+import memoizedTokenizer from "./helpers/memoizedSentenceTokenizer";
+
 
 // All helpers
 import getClauses from "./helpers/getClauses";
@@ -43,6 +45,7 @@ export default class Researcher extends AbstractResearcher {
 			getClauses,
 			getStemmer,
 			fleschReadingScore,
+			memoizedTokenizer,
 		} );
 	}
 }
diff --git a/packages/yoastseo/src/languageProcessing/languages/de/helpers/internal/SentenceTokenizer.js b/packages/yoastseo/src/languageProcessing/languages/de/helpers/internal/SentenceTokenizer.js
@@ -0,0 +1,31 @@
+import SentenceTokenizer from "../../../../helpers/sentence/SentenceTokenizer";
+import wordBoundaries from "../../../../../config/wordBoundaries";
+
+// The beginning of a string (^) or a word boundary from the wordBoundaries helper, then 1 to 3 digits and a full stop at the end.
+const escapedWordBoundaries = wordBoundaries().map( ( boundary ) => "\\" + boundary ).join( "" );
+const ordinalDotRegex = new RegExp( "(^|[" + escapedWordBoundaries + "])\\d{1,3}\\.$" );
+
+/**
+ * Sentence tokenizer for German.
+ *
+ * Extends the default sentence tokenizer with recognition of the German ordinal dot:
+ * a full stop that directly follows a number of one to three digits (e.g. "In den 66. Club")
+ * marks an ordinal number and does not end the sentence.
+ */
+export default class GermanSentenceTokenizer extends SentenceTokenizer {
+	/**
+	 * Checks whether a full stop is an ordinal dot instead of a sentence splitter.
+	 * See: https://en.wikipedia.org/wiki/Ordinal_indicator#Ordinal_dot
+	 *
+	 * Surrounding whitespace is trimmed before matching, so trailing spaces after the
+	 * full stop do not prevent a match.
+	 *
+	 * @param {string} currentSentence A string ending with a full stop.
+	 *
+	 * @returns {boolean} Returns true if the full stop is an ordinal dot, false otherwise.
+	 */
+	endsWithOrdinalDot( currentSentence ) {
+		return ordinalDotRegex.test( currentSentence.trim() );
+	}
+}
+
diff --git a/packages/yoastseo/src/languageProcessing/languages/de/helpers/memoizedSentenceTokenizer.js b/packages/yoastseo/src/languageProcessing/languages/de/helpers/memoizedSentenceTokenizer.js
@@ -0,0 +1,23 @@
+import { memoize } from "lodash-es";
+import SentenceTokenizer from "./internal/SentenceTokenizer";
+
+/**
+ * Splits the HTML of a single block into sentences, using the German sentence tokenizer.
+ *
+ * @param {string} block The HTML inside a HTML block.
+ * @returns {Array<string>} The list of sentences in the block.
+ */
+function getSentenceTokenizer( block ) {
+	const germanTokenizer = new SentenceTokenizer();
+	const { tokenizer, tokens } = germanTokenizer.createTokenizer();
+	germanTokenizer.tokenize( tokenizer, block );
+	/*
+	 * A block that contains only a paragraph tag yields no sentences. This filter is necessary
+	 * since switching between editors might add extra paragraph tags with a new line tag in the end
+	 * that are incorrectly converted into separate blocks.
+	 */
+	const isOnlyParagraphTag = new RegExp( "^(<p>|</p>)$" ).test( block );
+	return ( tokens.length === 0 || isOnlyParagraphTag ) ? [] : germanTokenizer.getSentencesFromTokens( tokens );
+}
+
+export default memoize( getSentenceTokenizer );
diff --git a/packages/yoastseo/src/languageProcessing/researches/getPassiveVoiceResult.js b/packages/yoastseo/src/languageProcessing/researches/getPassiveVoiceResult.js
@@ -16,8 +16,9 @@ import { forEach } from "lodash-es";
 export const getMorphologicalPassives = function( paper, researcher ) {
 	const isPassiveSentence = researcher.getHelper( "isPassiveSentence" );
 	const text = paper.getText();
-	// It's not necessary to pass the memoized tokenizer from the researcher here, since only Japanese has the language specific tokenizer.
-	// Passive voice analysis is not supported in Japanese.
+	// It's not necessary to pass the memoized tokenizer from the researcher here: only Japanese and German have
+	// a language-specific tokenizer. Passive voice analysis is not supported in Japanese, and German passive
+	// voice is periphrastic, so it is handled by getPeriphrasticPassives instead.
 	const sentences = getSentences( text )
 		.map( function( sentence ) {
 			return new Sentence( sentence );
@@ -53,7 +54,8 @@ export const getMorphologicalPassives = function( paper, researcher ) {
 export const getPeriphrasticPassives = function( paper, researcher ) {
 	const getClauses = researcher.getHelper( "getClauses" );
 	const text = paper.getText();
-	const sentences = getSentences( text )
+	const memoizedTokenizer = researcher.getHelper( "memoizedTokenizer" );
+	const sentences = getSentences( text, memoizedTokenizer )
 		.map( function( sentence ) {
 			return new Sentence( sentence );
 		} );