Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
31262ae
Exclude links from prominent words
agnieszkaszuba Nov 3, 2022
a331571
Add unit tests
agnieszkaszuba Nov 7, 2022
8c9d663
Add helper for removing email addresses
agnieszkaszuba Nov 7, 2022
89957a0
Remove email addresses from the text before retrieving the prominent …
agnieszkaszuba Nov 7, 2022
d4ff1e7
Change the regex so that it doesn't match URLs that don't start with …
agnieszkaszuba Nov 7, 2022
63df888
Add more unit tests
agnieszkaszuba Nov 7, 2022
32c1ec6
Fix code style
agnieszkaszuba Nov 7, 2022
e1ee776
Add unit tests to getProminentWordsForInternalLinkingSpec
agnieszkaszuba Nov 8, 2022
7011f0b
Adds a semi-colon to the characters that can appear after the top-lev…
agnieszkaszuba Nov 8, 2022
e60b3c7
Add unit tests for prominent words for insights research
agnieszkaszuba Nov 8, 2022
1e2a7e6
Adjust prominent words for internal linking research unit tests
agnieszkaszuba Nov 8, 2022
c3b2c9b
Merge branch 'trunk' of https://github.com/Yoast/wordpress-seo into P…
marinakoleva Nov 10, 2022
a482a1a
Merge branch 'PC-865-exclude-links-from-prominent-words' of github.co…
agnieszkaszuba Nov 10, 2022
43811c8
adding a comment to removeURLs.js
marinakoleva Nov 10, 2022
2a3efa5
Merge branch 'trunk' of https://github.com/Yoast/wordpress-seo into P…
marinakoleva Nov 14, 2022
d115093
Merge branch 'trunk' of github.com:Yoast/wordpress-seo into PC-865-ex…
iolse Nov 17, 2022
2014f76
Merge branch 'trunk' of github.com:Yoast/wordpress-seo into PC-865-ex…
agnieszkaszuba Nov 28, 2022
cc3179c
Remove spaces from Japanese text before counting characters
agnieszkaszuba Nov 29, 2022
f940c11
Fix unit tests
agnieszkaszuba Nov 29, 2022
8ccdf72
Merge branch 'trunk' of github.com:Yoast/wordpress-seo into PC-865-ex…
FAMarfuaty Dec 5, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ const expectedResults = {
textLength: {
isApplicable: true,
score: 9,
resultText: "<a href='https://yoa.st/34n' target='_blank'>Text length</a>: The text contains 3165 characters. Good job!",
resultText: "<a href='https://yoa.st/34n' target='_blank'>Text length</a>: The text contains 3022 characters. Good job!",
},
externalLinks: {
isApplicable: true,
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
import removeEmailAddresses from "../../../../src/languageProcessing/helpers/sanitize/removeEmailAddresses.js";

describe( "a test for removing email addresses from a string", () => {
	it( "removes an email address", () => {
		expect( removeEmailAddresses( "example@something.com" ) ).toBe( "" );
	} );
	it( "removes an email address with special characters", () => {
		expect( removeEmailAddresses( "some+long+email+address23@some+host-weird-/looking.com" ) ).toBe( "" );
	} );
	it( "removes a very short email address", () => {
		expect( removeEmailAddresses( "a@b.com" ) ).toBe( "" );
	} );
	it( "does not remove invalid email addresses", () => {
		// Strings missing a local part, a domain dot, or a domain should be left untouched.
		expect( removeEmailAddresses( "@b.com" ) ).toBe( "@b.com" );
		expect( removeEmailAddresses( "a@b" ) ).toBe( "a@b" );
		expect( removeEmailAddresses( "example@" ) ).toBe( "example@" );
		expect( removeEmailAddresses( "example.com" ) ).toBe( "example.com" );
	} );
} );
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
import removeURLs from "../../../../src/languageProcessing/helpers/sanitize/removeURLs.js";

describe( "a test for removing URLs from a string", () => {
	it( "removes a base URL", () => {
		expect( removeURLs( "https://example.com" ) ).toBe( "" );
	} );
	it( "removes a URL followed by a subdirectory", () => {
		expect( removeURLs( "https://example.com/example1" ) ).toBe( "" );
	} );
	it( "removes a URL followed by multiple subdirectories", () => {
		expect( removeURLs( "https://example.com/example1/part1" ) ).toBe( "" );
	} );
	it( "removes a URL with a subdomain", () => {
		expect( removeURLs( "https://blog.example.com/examples" ) ).toBe( "" );
	} );
	it( "removes a URL starting with http://", () => {
		expect( removeURLs( "http://blog.example.com/examples" ) ).toBe( "" );
	} );
	it( "removes a URL containing www.", () => {
		expect( removeURLs( "http://www.blog.example.com/examples" ) ).toBe( "" );
	} );
	it( "removes a URL starting with www.", () => {
		expect( removeURLs( "www.blog.example.com/examples" ) ).toBe( "" );
	} );
	it( "removes a URL starting with ftp", () => {
		expect( removeURLs( "ftp://example.com" ) ).toBe( "" );
	} );
	it( "removes a URL containing special characters.", () => {
		expect( removeURLs( "https://www.example.com/foo/?bar=baz&inga=42&quux" ) ).toBe( "" );
	} );
	it( "removes a URL containing a semi-colon.", () => {
		expect( removeURLs( "https://www.example.com/foo/?bar=baz&amp;inga=42&amp;quux" ) ).toBe( "" );
	} );
	it( "removes a URL containing more special characters.", () => {
		expect( removeURLs( "http://foo.com/blah_(wikipedia)_blah#cite-1" ) ).toBe( "" );
	} );
	it( "removes a URL with a different top-level domain", () => {
		expect( removeURLs( "http://example.co.uk" ) ).toBe( "" );
	} );
	it( "removes a URL followed by Japanese characters", () => {
		// Only the URL itself should be stripped; the adjacent Japanese text must survive.
		expect( removeURLs( "https://example.comこれに対し日本国有鉄道" ) ).toBe( "これに対し日本国有鉄道" );
	} );
	it( "does not remove a URL that doesn't start with 'http(s)://', 'ftp://' or 'www'.", () => {
		expect( removeURLs( "example.com" ) ).toBe( "example.com" );
	} );
	it( "does not remove https:// on its own", () => {
		expect( removeURLs( "https://" ) ).toBe( "https://" );
	} );
	it( "does not remove a URL without a top-level domain", () => {
		expect( removeURLs( "https://example" ) ).toBe( "https://example" );
	} );
} );
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ describe( "A test to count sentence lengths.", function() {

expect( lengths ).toEqual( [
{ sentence: "自然おのずから存在しているもの", sentenceLength: 15 },
{ sentence: "歩くさわやかな森 <span style='color: red;'> 自然 </span>", sentenceLength: 11 },
{ sentence: "歩くさわやかな森 <span style='color: red;'> 自然 </span>", sentenceLength: 10 },
] );
} );
} );
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import { doesWordMatchRegex } from "../../../../../src/languageProcessing/helpers/morphology/regexHelpers";
import countCharactersFunction from "../../../../../src/languageProcessing/languages/ja/helpers/countCharacters.js";

describe( "counts characters in a string", function() {
Expand All @@ -13,9 +12,9 @@ describe( "counts characters in a string", function() {
"東京オリンピック開会直前の1964年(昭和39年)10月1日に開業した。" ) ).toBe( 136 );
} );
it( "makes sure the countCharacters function still works when the input is a non-Japanese string", function() {
expect( countCharactersFunction( "this is a string" ) ).toBe( 16 );
expect( countCharactersFunction( "this is a string" ) ).toBe( 13 );
expect( countCharactersFunction( "Низът в компютърните науки е крайна поредица от символи " +
"(представляващи краен брой знаци)." ) ).toBe( 90 );
"(представляващи краен брой знаци)." ) ).toBe( 78 );
} );
it( "makes sure that the table of contents is excluded from the calculation", function() {
const text = "<div class=\"wp-block-yoast-seo-table-of-contents yoast-table-of-contents\"><h2>目次</h2><ul><li><a " +
Expand All @@ -36,9 +35,9 @@ describe( "counts characters in a string", function() {
"<p>戦後においては一般に広義の<a href=\"https://ja.wikipedia.org/wiki/%E7%AB%A5%E8%AC%A1\">童謡</a>にカテゴライズされる本作品は、" +
"初出本の題名にもあるとおり青木自身は「唱歌」であるとし、「学校や家庭で」歌ってもらえれば本懐であるとしている。しかし発表当時の教育現場では、" +
"本作品を歌うことは原則上はできなかった。</p>";
expect( countCharactersFunction( text ) ).toBe( 757 );
expect( countCharactersFunction( text ) ).toBe( 744 );
} );
it( "makes sure that no charachters are counted when a URL is embedded in video tags", function() {
it( "makes sure that no characters are counted when a URL is embedded in video tags", function() {
const text = "<!-- wp:embed {\"url\":\"https://www.youtube.com/watch?v=cbP2N1BQdYc\",\"type\":\"video\"," +
"\"providerNameSlug\":\"youtube\",\"responsive\":true,\"className\":\"wp-embed-aspect-16-9 wp-has-aspect-ratio\"} -->\n" +
"\t\t\t<figure class=\"wp-block-embed is-type-video is-provider-youtube wp-block-embed-youtube wp-embed-aspect" +
Expand All @@ -48,22 +47,3 @@ describe( "counts characters in a string", function() {
expect( countCharactersFunction( text ) ).toBe( 0 );
} );
} );

describe( "A test to return a regex match for URLs", () => {
it( "Returns true if there is a match against a URL starting with www", () => {
// eslint-disable-next-line max-len
expect( doesWordMatchRegex( "www.yoast.com", "(http(s)?:\\/\\/.)?(www\\.|ftp:\\/\\/)?[-a-zA-Z0-9@:%._\\/+~#=]{2,256}\\.[a-z]{2,6}\\b([-a-zA-Z0-9@:%_\\/+.~#?&()=]*)" ) ).toEqual( true );
} );
it( "Returns true if there is a match against a URL starting with https", () => {
// eslint-disable-next-line max-len
expect( doesWordMatchRegex( "https://www.codecademy.com/learn/hello", "(http(s)?:\\/\\/.)?(www\\.|ftp:\\/\\/)?[-a-zA-Z0-9@:%._\\/+~#=]{2,256}\\.[a-z]{2,6}\\b([-a-zA-Z0-9@:%_\\/+.~#?&()=]*)" ) ).toEqual( true );
} );
it( "Returns true if there is a match against a URL starting with http", () => {
// eslint-disable-next-line max-len
expect( doesWordMatchRegex( "http://foo.com/blah_blah/", "(http(s)?:\\/\\/.)?(www\\.|ftp:\\/\\/)?[-a-zA-Z0-9@:%._\\/+~#=]{2,256}\\.[a-z]{2,6}\\b([-a-zA-Z0-9@:%_\\/+.~#?&()=]*)" ) ).toEqual( true );
} );
it( "Returns false if there is no match", () => {
// eslint-disable-next-line max-len
expect( doesWordMatchRegex( "My cat is sweet!", "(http(s)?:\\/\\/.)?(www\\.|ftp:\\/\\/)?[-a-zA-Z0-9@:%._\\/+~#=]{2,256}\\.[a-z]{2,6}\\b([-a-zA-Z0-9@:%_\\/+.~#?&()=]*)" ) ).toEqual( false );
} );
} );
Original file line number Diff line number Diff line change
Expand Up @@ -175,3 +175,42 @@ describe( "test for prominent words research for languages that have custom help
expect( words ).toEqual( expected );
} );
} );

describe( "test for filtering out URLs and email addresses", () => {
	it( "does not include URLs in prominent words", () => {
		const paper = new Paper( "http://blog.example.com/examples ".repeat( 180 ) + "cats ".repeat( 50 ) );
		const researcher = new Researcher( paper );
		researcher.addResearchData( "morphology", morphologyData );

		// The repeated URL is filtered out, so only "cats" remains prominent.
		expect( getProminentWordsForInsights( paper, researcher ) ).toEqual( [
			new ProminentWord( "cats", "cat", 50 ),
		] );
	} );

	it( "does not include email addresses in prominent words", () => {
		const paper = new Paper( "example89@something.com ".repeat( 180 ) + "cats ".repeat( 50 ) );
		const researcher = new Researcher( paper );
		researcher.addResearchData( "morphology", morphologyData );

		// The repeated email address is filtered out, so only "cats" remains prominent.
		expect( getProminentWordsForInsights( paper, researcher ) ).toEqual( [
			new ProminentWord( "cats", "cat", 50 ),
		] );
	} );

	it( "includes domain names in prominent words", () => {
		const paper = new Paper( "example.com ".repeat( 180 ) + "cats ".repeat( 50 ) );
		const researcher = new Researcher( paper );
		researcher.addResearchData( "morphology", morphologyData );

		// A bare domain name (no scheme, no "www.") is not treated as a URL and stays in the results.
		expect( getProminentWordsForInsights( paper, researcher ) ).toEqual( [
			new ProminentWord( "example.com", "example.com", 180 ),
			new ProminentWord( "cats", "cat", 50 ),
		] );
	} );
} );
Original file line number Diff line number Diff line change
Expand Up @@ -381,5 +381,73 @@ describe( "test for prominent words research for languages that have custom help

expect( words ).toEqual( expected );
} );

it( "does not count URLs and email addresses as prominent words", function() {
	// The text and the title contain only URLs and email addresses, so no prominent words should remain.
	const paper = new Paper( "http://blog.example.com/examples ".repeat( 180 ) + "example@something.com ".repeat( 180 ),
		{ title: "example@something.com example@something.com example@something.com" } );

	const researcher = new Researcher( paper );

	const expected = {
		prominentWords: [],
		hasMetaDescription: false,
		hasTitle: true,
	};

	const words = prominentWordsResearch( paper, researcher );

	expect( words ).toEqual( expected );
} );

it( "counts domain names as prominent words", function() {
	// A bare domain name (no scheme, no "www.") is deliberately not filtered out.
	const paper = new Paper( "yoast.com ".repeat( 180 ) );

	const researcher = new Researcher( paper );

	const expected = {
		prominentWords: [ new ProminentWord( "yoast.com", "yoast.com", 180 ) ],
		hasMetaDescription: false,
		hasTitle: false,
	};

	const words = prominentWordsResearch( paper, researcher );

	expect( words ).toEqual( expected );
} );

it( "does not return prominent words when the text is longer than 100 words including URLs and emails, but shorter " +
	"than 100 words when they are excluded", function() {
	// Only 50 occurrences of "cats" survive the filtering, which is below the 100-word minimum this research requires.
	const paper = new Paper( "http://blog.example.com/examples ".repeat( 180 ) + "example@something.com ".repeat( 180 ) +
		" cats".repeat( 50 ), { title: "example@something.com example@something.com example@something.com" } );

	const researcher = new Researcher( paper );

	const expected = {
		prominentWords: [],
		hasMetaDescription: false,
		hasTitle: true,
	};

	const words = prominentWordsResearch( paper, researcher );

	expect( words ).toEqual( expected );
} );

it( "returns prominent words when the text is longer than 100 words after excluding URLs and emails", function() {
	// 101 occurrences of "cats" survive the filtering, which is just above the 100-word minimum.
	const paper = new Paper( "http://blog.example.com/examples ".repeat( 180 ) + "example@something.com ".repeat( 180 ) +
		" cats".repeat( 101 ), { title: "example@something.com example@something.com example@something.com" } );

	const researcher = new Researcher( paper );

	const expected = {
		prominentWords: [ new ProminentWord( "cats", "cats", 101 ) ],
		hasMetaDescription: false,
		hasTitle: true,
	};

	const words = prominentWordsResearch( paper, researcher );

	expect( words ).toEqual( expected );
} );
} );

Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
// Matches an email address: a local part, "@", and a domain containing at least one dot,
// where none of the parts may contain whitespace or an extra "@".
// A regex literal is used instead of a double-escaped `new RegExp` string, which is error-prone;
// the "g" flag makes `replace` remove every occurrence (the previous "i" and "m" flags had no
// effect, as the pattern contains no letters and no anchors).
const emailRegex = /[^\s@]+@[^\s@]+\.[^\s@]+/g;

/**
 * Removes email addresses from a text.
 *
 * @param {string} text The text to remove emails from.
 *
 * @returns {string} The text without email addresses.
 */
export default function removeEmailAddresses( text ) {
	return text.replace( emailRegex, "" );
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
/*
 * Matches URLs starting with "http://", "https://", "ftp://", or "www.", whether they are embedded in tags or not.
 * It deliberately doesn't match bare domain names (e.g. "yoast.com" in "We got so much traffic on yoast.com").
 *
 * Two fixes over the original double-escaped `new RegExp` string:
 * 1. "(www\\\\.)?" compiled to the regex `(www\\.)?` — "www" followed by a literal backslash and any
 *    character — which could never match "www."; it is now `(www\.)?`.
 * 2. The "ftp" alternative didn't require "://", so any token merely starting with "ftp" could be
 *    stripped, contradicting the contract above; "ftp:\/\/" is now required.
 */
// eslint-disable-next-line max-len
const urlRegex = /(ftp:\/\/|http(s)?:\/\/)(www\.)?[-a-zA-Z0-9@:%._\/+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:;%_\/+.~#?&()=]*)|www\.[-a-zA-Z0-9@:%._\/+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:;%_\/+.~#?&()=]*)/gi;

/**
 * Removes URLs from a text.
 *
 * @param {string} text The text to remove URLs from.
 *
 * @returns {string} The text without URLs.
 */
export default function removeURLs( text ) {
	return text.replace( urlRegex, "" );
}
Original file line number Diff line number Diff line change
@@ -1,19 +1,20 @@
import { languageProcessing } from "yoastseo";
const { sanitizeString } = languageProcessing;
import removeURLs from "../../../helpers/sanitize/removeURLs.js";

/**
 * Calculates the character count which serves as a measure of text length.
 * The character count includes letters, punctuation, and numbers. It doesn't include URLs, HTML tags, spaces, and the
 * content of the Table of Contents and Estimated Reading Time blocks.
 *
 * @param {string} text The text to be counted.
 *
 * @returns {number} The character count of the given text.
 */
export default function( text ) {
	// Strip URLs first so their characters don't inflate the count.
	text = removeURLs( text );
	// Strips markup such as HTML tags from the text (NOTE(review): exact scope depends on yoastseo's sanitizeString — verify).
	text = sanitizeString( text );
	// Remove all whitespace so spaces don't count towards the length.
	text = text.replace( /\s/g, "" );

	return text.length;
}
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ import {
retrieveAbbreviations,
sortProminentWords,
} from "../helpers/prominentWords/determineProminentWords";
import removeURLs from "../helpers/sanitize/removeURLs.js";
import removeEmailAddresses from "../helpers/sanitize/removeEmailAddresses";

/**
* Retrieves the prominent words from the given paper.
Expand All @@ -23,7 +25,10 @@ function getProminentWordsForInsights( paper, researcher ) {
// An optional custom helper to get words from the text.
const getWordsCustomHelper = researcher.getHelper( "getWordsCustomHelper" );

const text = paper.getText();
let text = paper.getText();
// We don't want to include URLs or email addresses in prominent words.
text = removeURLs( text );
text = removeEmailAddresses( text );

// If the language has a custom helper to get words from the text, we don't retrieve the abbreviation.
const abbreviations = getWordsCustomHelper ? [] : retrieveAbbreviations( text );
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,20 @@ import {
} from "../helpers/prominentWords/determineProminentWords";
import { getSubheadingsTopLevel, removeSubheadingsTopLevel } from "../helpers/html/getSubheadings";
import baseStemmer from "../helpers/morphology/baseStemmer";
import removeURLs from "../helpers/sanitize/removeURLs.js";
import removeEmailAddresses from "../helpers/sanitize/removeEmailAddresses";

/**
* Removes URLs and email addresses from the text.
*
* @param {string} text The text to sanitize.
*
* @returns {string} The text without URLs and email addresses.
*/
const sanitizeText = function( text ) {
text = removeURLs( text );
return removeEmailAddresses( text );
};

/**
* Retrieves the prominent words from the given paper.
Expand All @@ -32,9 +46,9 @@ function getProminentWordsForInternalLinking( paper, researcher ) {
// An optional custom helper to count length to use instead of countWords.
const customCountLength = researcher.getHelper( "customCountLength" );

const text = paper.getText();
const metadescription = paper.getDescription();
const title = paper.getTitle();
const text = sanitizeText( paper.getText() );
const metadescription = sanitizeText( paper.getDescription() );
const title = sanitizeText( paper.getTitle() );

const result = {};
result.hasMetaDescription = metadescription !== "";
Expand Down