From 31262ae8d9fbbc73d5930b8f5021e32a2bcd47a3 Mon Sep 17 00:00:00 2001
From: Agnieszka Szuba
Date: Thu, 3 Nov 2022 17:32:43 +0100
Subject: [PATCH 01/14] Exclude links from prominent words
---
.../helpers/sanitize/removeURLs.js | 13 +++++++++++++
.../languages/ja/helpers/countCharacters.js | 6 ++----
.../researches/getProminentWordsForInsights.js | 3 ++-
.../getProminentWordsForInternalLinking.js | 7 ++++---
4 files changed, 21 insertions(+), 8 deletions(-)
create mode 100644 packages/yoastseo/src/languageProcessing/helpers/sanitize/removeURLs.js
diff --git a/packages/yoastseo/src/languageProcessing/helpers/sanitize/removeURLs.js b/packages/yoastseo/src/languageProcessing/helpers/sanitize/removeURLs.js
new file mode 100644
index 00000000000..61315352a06
--- /dev/null
+++ b/packages/yoastseo/src/languageProcessing/helpers/sanitize/removeURLs.js
@@ -0,0 +1,13 @@
+const urlRegex = new RegExp( "(http(s)?:\\/\\/.)?(www\\.|ftp:\\/\\/)?[-a-zA-Z0-9@:%._\\/+~#=]{2,256}\\.[a-z]{2,6}\\b" +
+ "([-a-zA-Z0-9@:%_\\/+.~#?&()=]*)", "igm" );
+
+/**
+ * Removes URLs from a text.
+ *
+ * @param {string} text The text to remove URLs from.
+ *
+ * @returns {string} The text without URLs.
+ */
+export default function( text ) {
+ return text.replace( urlRegex, "" );
+}
diff --git a/packages/yoastseo/src/languageProcessing/languages/ja/helpers/countCharacters.js b/packages/yoastseo/src/languageProcessing/languages/ja/helpers/countCharacters.js
index b18ce6d84e6..782d58371dd 100644
--- a/packages/yoastseo/src/languageProcessing/languages/ja/helpers/countCharacters.js
+++ b/packages/yoastseo/src/languageProcessing/languages/ja/helpers/countCharacters.js
@@ -1,5 +1,6 @@
import { languageProcessing } from "yoastseo";
const { sanitizeString } = languageProcessing;
+import removeURLs from "../../../helpers/sanitize/removeURLs.js";
/**
* Calculates the character count of a text, including punctuation and numbers. Is used to determine length of text.
@@ -9,10 +10,7 @@ const { sanitizeString } = languageProcessing;
* @returns {number} The character count of the given text.
*/
export default function( text ) {
- // This regex is used to match URLs in the text, either embedded in tags or not, so that they are excluded from the characters count.
- // eslint-disable-next-line max-len
- const urlRegex = new RegExp( "(http(s)?:\\/\\/.)?(www\\.|ftp:\\/\\/)?[-a-zA-Z0-9@:%._\\/+~#=]{2,256}\\.[a-z]{2,6}\\b([-a-zA-Z0-9@:%_\\/+.~#?&()=]*)", "igm" );
- text = text.replace( urlRegex, "" );
+ text = removeURLs( text );
text = sanitizeString( text );
return text.length;
diff --git a/packages/yoastseo/src/languageProcessing/researches/getProminentWordsForInsights.js b/packages/yoastseo/src/languageProcessing/researches/getProminentWordsForInsights.js
index 7e994d648e9..7ba268d0076 100644
--- a/packages/yoastseo/src/languageProcessing/researches/getProminentWordsForInsights.js
+++ b/packages/yoastseo/src/languageProcessing/researches/getProminentWordsForInsights.js
@@ -6,6 +6,7 @@ import {
retrieveAbbreviations,
sortProminentWords,
} from "../helpers/prominentWords/determineProminentWords";
+import removeURLs from "../helpers/sanitize/removeURLs.js";
/**
* Retrieves the prominent words from the given paper.
@@ -23,7 +24,7 @@ function getProminentWordsForInsights( paper, researcher ) {
// An optional custom helper to get words from the text.
const getWordsCustomHelper = researcher.getHelper( "getWordsCustomHelper" );
- const text = paper.getText();
+ const text = removeURLs( paper.getText() );
// If the language has a custom helper to get words from the text, we don't retrieve the abbreviation.
const abbreviations = getWordsCustomHelper ? [] : retrieveAbbreviations( text );
diff --git a/packages/yoastseo/src/languageProcessing/researches/getProminentWordsForInternalLinking.js b/packages/yoastseo/src/languageProcessing/researches/getProminentWordsForInternalLinking.js
index 46faefc1b4e..560664a4743 100644
--- a/packages/yoastseo/src/languageProcessing/researches/getProminentWordsForInternalLinking.js
+++ b/packages/yoastseo/src/languageProcessing/researches/getProminentWordsForInternalLinking.js
@@ -10,6 +10,7 @@ import {
} from "../helpers/prominentWords/determineProminentWords";
import { getSubheadingsTopLevel, removeSubheadingsTopLevel } from "../helpers/html/getSubheadings";
import baseStemmer from "../helpers/morphology/baseStemmer";
+import removeURLs from "../helpers/sanitize/removeURLs.js";
/**
* Retrieves the prominent words from the given paper.
@@ -32,9 +33,9 @@ function getProminentWordsForInternalLinking( paper, researcher ) {
// An optional custom helper to count length to use instead of countWords.
const customCountLength = researcher.getHelper( "customCountLength" );
- const text = paper.getText();
- const metadescription = paper.getDescription();
- const title = paper.getTitle();
+ const text = removeURLs( paper.getText() );
+ const metadescription = removeURLs( paper.getDescription() );
+ const title = removeURLs( paper.getTitle() );
const result = {};
result.hasMetaDescription = metadescription !== "";
From a331571f9d45a87ffb09e3406289edc9c72cd4dd Mon Sep 17 00:00:00 2001
From: Agnieszka Szuba
Date: Mon, 7 Nov 2022 13:57:23 +0100
Subject: [PATCH 02/14] Add unit tests
---
.../sanitize/removeEmailAddressesSpec.js | 0
.../helpers/sanitize/removeURLsSpec.js | 34 +++++++++++++++++++
2 files changed, 34 insertions(+)
create mode 100644 packages/yoastseo/spec/languageProcessing/helpers/sanitize/removeEmailAddressesSpec.js
create mode 100644 packages/yoastseo/spec/languageProcessing/helpers/sanitize/removeURLsSpec.js
diff --git a/packages/yoastseo/spec/languageProcessing/helpers/sanitize/removeEmailAddressesSpec.js b/packages/yoastseo/spec/languageProcessing/helpers/sanitize/removeEmailAddressesSpec.js
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/packages/yoastseo/spec/languageProcessing/helpers/sanitize/removeURLsSpec.js b/packages/yoastseo/spec/languageProcessing/helpers/sanitize/removeURLsSpec.js
new file mode 100644
index 00000000000..e69b6eace70
--- /dev/null
+++ b/packages/yoastseo/spec/languageProcessing/helpers/sanitize/removeURLsSpec.js
@@ -0,0 +1,34 @@
+import removeURLs from "../../../../src/languageProcessing/helpers/sanitize/removeURLs.js";
+
+describe( "a test for removing URLs from a string", function() {
+ it( "removes a base URL", function() {
+ expect( removeURLs( "https://example.com" ) ).toBe( "" );
+ } );
+ it( "removes a URL followed by a subdirectory", function() {
+ expect( removeURLs( "https://example.com/example1" ) ).toBe( "" );
+ } );
+ it( "removes a URL followed by multiple subdirectories", function() {
+ expect( removeURLs( "https://example.com/example1/part1" ) ).toBe( "" );
+ } );
+ it( "removes a URL with a subdomain", function() {
+ expect( removeURLs( "https://blog.example.com/examples" ) ).toBe( "" );
+ } );
+ it( "removes a URL starting with http://", function() {
+ expect( removeURLs( "http://blog.example.com/examples" ) ).toBe( "" );
+ } );
+ it( "removes a URL containing www.", function() {
+ expect( removeURLs( "http://www.blog.example.com/examples" ) ).toBe( "" );
+ } );
+ it( "removes a URL containing special characters.", function() {
+ expect( removeURLs( "https://www.example.com/foo/?bar=baz&inga=42&quux" ) ).toBe( "" );
+ } );
+ it( "removes a URL containing more special characters.", function() {
+ expect( removeURLs( "http://foo.com/blah_(wikipedia)_blah#cite-1" ) ).toBe( "" );
+ } );
+ it( "removes a URL with a different top-level domain", function() {
+ expect( removeURLs( "http://example.co.uk" ) ).toBe( "" );
+ } );
+ it( "does not remove a string if it doesn't start with http(s)://", function() {
+ expect( removeURLs( "example.com" ) ).toBe( "blog.example.com/examples" );
+ } );
+} );
From 8c9d663c68dfc1fee8846e2e6f1f218d6922dc60 Mon Sep 17 00:00:00 2001
From: Agnieszka Szuba
Date: Mon, 7 Nov 2022 13:57:33 +0100
Subject: [PATCH 03/14] Add helper for removing email addresses
---
.../helpers/sanitize/removeEmailAddresses.js | 12 ++++++++++++
1 file changed, 12 insertions(+)
create mode 100644 packages/yoastseo/src/languageProcessing/helpers/sanitize/removeEmailAddresses.js
diff --git a/packages/yoastseo/src/languageProcessing/helpers/sanitize/removeEmailAddresses.js b/packages/yoastseo/src/languageProcessing/helpers/sanitize/removeEmailAddresses.js
new file mode 100644
index 00000000000..48e2c76029d
--- /dev/null
+++ b/packages/yoastseo/src/languageProcessing/helpers/sanitize/removeEmailAddresses.js
@@ -0,0 +1,12 @@
+const emailRegex = new RegExp( "[^\\s@]+@[^\\s@]+\\.[^\\s@]+", "igm" );
+
+/**
+ * Removes email addresses from a text.
+ *
+ * @param {string} text The text to remove emails from.
+ *
+ * @returns {string} The text without email addresses.
+ */
+export default function( text ) {
+ return text.replace( emailRegex, "" );
+}
From 89957a081d06a9e9c359475b9797a77e63c5923e Mon Sep 17 00:00:00 2001
From: Agnieszka Szuba
Date: Mon, 7 Nov 2022 13:58:14 +0100
Subject: [PATCH 04/14] Remove email addresses from the text before retrieving
the prominent words
---
.../researches/getProminentWordsForInsights.js | 6 +++++-
.../researches/getProminentWordsForInternalLinking.js | 7 ++++++-
2 files changed, 11 insertions(+), 2 deletions(-)
diff --git a/packages/yoastseo/src/languageProcessing/researches/getProminentWordsForInsights.js b/packages/yoastseo/src/languageProcessing/researches/getProminentWordsForInsights.js
index 7ba268d0076..0079fb5442b 100644
--- a/packages/yoastseo/src/languageProcessing/researches/getProminentWordsForInsights.js
+++ b/packages/yoastseo/src/languageProcessing/researches/getProminentWordsForInsights.js
@@ -7,6 +7,7 @@ import {
sortProminentWords,
} from "../helpers/prominentWords/determineProminentWords";
import removeURLs from "../helpers/sanitize/removeURLs.js";
+import removeEmailAddresses from "../helpers/sanitize/removeEmailAddresses";
/**
* Retrieves the prominent words from the given paper.
@@ -24,7 +25,10 @@ function getProminentWordsForInsights( paper, researcher ) {
// An optional custom helper to get words from the text.
const getWordsCustomHelper = researcher.getHelper( "getWordsCustomHelper" );
- const text = removeURLs( paper.getText() );
+ let text = paper.getText();
+ // We don't want to include URLs or email addresses in prominent words.
+ text = removeURLs( text );
+ text = removeEmailAddresses( text );
// If the language has a custom helper to get words from the text, we don't retrieve the abbreviation.
const abbreviations = getWordsCustomHelper ? [] : retrieveAbbreviations( text );
diff --git a/packages/yoastseo/src/languageProcessing/researches/getProminentWordsForInternalLinking.js b/packages/yoastseo/src/languageProcessing/researches/getProminentWordsForInternalLinking.js
index 560664a4743..66564bbe6fa 100644
--- a/packages/yoastseo/src/languageProcessing/researches/getProminentWordsForInternalLinking.js
+++ b/packages/yoastseo/src/languageProcessing/researches/getProminentWordsForInternalLinking.js
@@ -11,6 +11,7 @@ import {
import { getSubheadingsTopLevel, removeSubheadingsTopLevel } from "../helpers/html/getSubheadings";
import baseStemmer from "../helpers/morphology/baseStemmer";
import removeURLs from "../helpers/sanitize/removeURLs.js";
+import removeEmailAddresses from "../helpers/sanitize/removeEmailAddresses";
/**
* Retrieves the prominent words from the given paper.
@@ -33,7 +34,11 @@ function getProminentWordsForInternalLinking( paper, researcher ) {
// An optional custom helper to count length to use instead of countWords.
const customCountLength = researcher.getHelper( "customCountLength" );
- const text = removeURLs( paper.getText() );
+ let text = paper.getText();
+ // We don't want to include URLs or email addresses in prominent words.
+ text = removeURLs( text );
+ text = removeEmailAddresses( text );
+
const metadescription = removeURLs( paper.getDescription() );
const title = removeURLs( paper.getTitle() );
From d4ff1e7794c73f712fc929e378d5efb176eb0fad Mon Sep 17 00:00:00 2001
From: Agnieszka Szuba
Date: Mon, 7 Nov 2022 17:03:32 +0100
Subject: [PATCH 05/14] Change the regex so that it doesn't match URLs that
don't start with http(s), ftp, or www
For example, 'yoast.com' can be used as a word rather than a link, as in "There was so much traffic on yoast.com today".
---
.../src/languageProcessing/helpers/sanitize/removeURLs.js | 5 ++---
1 file changed, 2 insertions(+), 3 deletions(-)
diff --git a/packages/yoastseo/src/languageProcessing/helpers/sanitize/removeURLs.js b/packages/yoastseo/src/languageProcessing/helpers/sanitize/removeURLs.js
index 61315352a06..727fbc0237f 100644
--- a/packages/yoastseo/src/languageProcessing/helpers/sanitize/removeURLs.js
+++ b/packages/yoastseo/src/languageProcessing/helpers/sanitize/removeURLs.js
@@ -1,6 +1,5 @@
-const urlRegex = new RegExp( "(http(s)?:\\/\\/.)?(www\\.|ftp:\\/\\/)?[-a-zA-Z0-9@:%._\\/+~#=]{2,256}\\.[a-z]{2,6}\\b" +
- "([-a-zA-Z0-9@:%_\\/+.~#?&()=]*)", "igm" );
-
+const urlRegex = new RegExp( "(ftp|http(s)?:\\/\\/.)(www\\\\.)?[-a-zA-Z0-9@:%._\\/+~#=]{2,256}\\.[a-z]{2,6}\\b([-a-zA-Z0-9@:%_\\/+.~#?&()=]*)" +
+ "|www\\.[-a-zA-Z0-9@:%._\\/+~#=]{2,256}\\.[a-z]{2,6}\\b([-a-zA-Z0-9@:%_\\/+.~#?&()=]*)", "igm" );
/**
* Removes URLs from a text.
*
From 63df888afa8c6cb40b8f604b5ac0ce4096aaf0fb Mon Sep 17 00:00:00 2001
From: Agnieszka Szuba
Date: Mon, 7 Nov 2022 17:03:48 +0100
Subject: [PATCH 06/14] Add more unit tests
---
.../sanitize/removeEmailAddressesSpec.js | 19 +++++++++++++++++
.../helpers/sanitize/removeURLsSpec.js | 19 +++++++++++++++--
.../ja/helpers/countCharactersSpec.js | 21 +------------------
3 files changed, 37 insertions(+), 22 deletions(-)
diff --git a/packages/yoastseo/spec/languageProcessing/helpers/sanitize/removeEmailAddressesSpec.js b/packages/yoastseo/spec/languageProcessing/helpers/sanitize/removeEmailAddressesSpec.js
index e69de29bb2d..31178173848 100644
--- a/packages/yoastseo/spec/languageProcessing/helpers/sanitize/removeEmailAddressesSpec.js
+++ b/packages/yoastseo/spec/languageProcessing/helpers/sanitize/removeEmailAddressesSpec.js
@@ -0,0 +1,19 @@
+import removeEmailAddresses from "../../../../src/languageProcessing/helpers/sanitize/removeEmailAddresses.js";
+
+describe( "a test for removing email addresses from a string", function() {
+ it( "removes an email address", function() {
+ expect( removeEmailAddresses( "example@something.com" ) ).toBe( "" );
+ } );
+ it( "removes an email address with special characters", function() {
+ expect( removeEmailAddresses( "some+long+email+address23@some+host-weird-/looking.com" ) ).toBe( "" );
+ } );
+ it( "removes a very short email address", function() {
+ expect( removeEmailAddresses( "a@b.com" ) ).toBe( "" );
+ } );
+ it( "does not remove invalid email addresses", function() {
+ expect( removeEmailAddresses( "@b.com" ) ).toBe( "@b.com" );
+ expect( removeEmailAddresses( "a@b" ) ).toBe( "a@b" );
+ expect( removeEmailAddresses( "example@" ) ).toBe( "example@" );
+ expect( removeEmailAddresses( "example.com" ) ).toBe( "example.com" );
+ } );
+} );
diff --git a/packages/yoastseo/spec/languageProcessing/helpers/sanitize/removeURLsSpec.js b/packages/yoastseo/spec/languageProcessing/helpers/sanitize/removeURLsSpec.js
index e69b6eace70..2ec9429357a 100644
--- a/packages/yoastseo/spec/languageProcessing/helpers/sanitize/removeURLsSpec.js
+++ b/packages/yoastseo/spec/languageProcessing/helpers/sanitize/removeURLsSpec.js
@@ -19,6 +19,12 @@ describe( "a test for removing URLs from a string", function() {
it( "removes a URL containing www.", function() {
expect( removeURLs( "http://www.blog.example.com/examples" ) ).toBe( "" );
} );
+ it( "removes a URL starting with www.", function() {
+ expect( removeURLs( "www.blog.example.com/examples" ) ).toBe( "" );
+ } );
+ it( "removes a URL starting with ftp", function() {
+ expect( removeURLs( "ftp://example.com" ) ).toBe( "" );
+ } );
it( "removes a URL containing special characters.", function() {
expect( removeURLs( "https://www.example.com/foo/?bar=baz&inga=42&quux" ) ).toBe( "" );
} );
@@ -28,7 +34,16 @@ describe( "a test for removing URLs from a string", function() {
it( "removes a URL with a different top-level domain", function() {
expect( removeURLs( "http://example.co.uk" ) ).toBe( "" );
} );
- it( "does not remove a string if it doesn't start with http(s)://", function() {
- expect( removeURLs( "example.com" ) ).toBe( "blog.example.com/examples" );
+ it( "removes a URL followed by Japanese characters", function() {
+ expect( removeURLs( "https://example.comこれに対し日本国有鉄道" ) ).toBe( "これに対し日本国有鉄道" );
+ } );
+ it( "does not remove a URL that doesn't start with 'http(s)://', 'ftp://' or 'www'.", function() {
+ expect( removeURLs( "example.com" ) ).toBe( "example.com" );
+ } );
+ it( "does not remove https:// on its own", function() {
+ expect( removeURLs( "https://" ) ).toBe( "https://" );
+ } );
+ it( "does not remove a URL without a top-level domain", function() {
+ expect( removeURLs( "https://example" ) ).toBe( "https://example" );
} );
} );
diff --git a/packages/yoastseo/spec/languageProcessing/languages/ja/helpers/countCharactersSpec.js b/packages/yoastseo/spec/languageProcessing/languages/ja/helpers/countCharactersSpec.js
index 69cc0a18e6e..fb5bf8fbbc5 100644
--- a/packages/yoastseo/spec/languageProcessing/languages/ja/helpers/countCharactersSpec.js
+++ b/packages/yoastseo/spec/languageProcessing/languages/ja/helpers/countCharactersSpec.js
@@ -38,7 +38,7 @@ describe( "counts characters in a string", function() {
"本作品を歌うことは原則上はできなかった。
";
expect( countCharactersFunction( text ) ).toBe( 757 );
} );
- it( "makes sure that no charachters are counted when a URL is embedded in video tags", function() {
+ it( "makes sure that no characters are counted when a URL is embedded in video tags", function() {
const text = "\n" +
"\t\t\t {
- it( "Returns true if there is a match against a URL starting with www", () => {
- // eslint-disable-next-line max-len
- expect( doesWordMatchRegex( "www.yoast.com", "(http(s)?:\\/\\/.)?(www\\.|ftp:\\/\\/)?[-a-zA-Z0-9@:%._\\/+~#=]{2,256}\\.[a-z]{2,6}\\b([-a-zA-Z0-9@:%_\\/+.~#?&()=]*)" ) ).toEqual( true );
- } );
- it( "Returns true if there is a match against a URL starting with https", () => {
- // eslint-disable-next-line max-len
- expect( doesWordMatchRegex( "https://www.codecademy.com/learn/hello", "(http(s)?:\\/\\/.)?(www\\.|ftp:\\/\\/)?[-a-zA-Z0-9@:%._\\/+~#=]{2,256}\\.[a-z]{2,6}\\b([-a-zA-Z0-9@:%_\\/+.~#?&()=]*)" ) ).toEqual( true );
- } );
- it( "Returns true if there is a match against a URL starting with http", () => {
- // eslint-disable-next-line max-len
- expect( doesWordMatchRegex( "http://foo.com/blah_blah/", "(http(s)?:\\/\\/.)?(www\\.|ftp:\\/\\/)?[-a-zA-Z0-9@:%._\\/+~#=]{2,256}\\.[a-z]{2,6}\\b([-a-zA-Z0-9@:%_\\/+.~#?&()=]*)" ) ).toEqual( true );
- } );
- it( "Returns false if there is no match", () => {
- // eslint-disable-next-line max-len
- expect( doesWordMatchRegex( "My cat is sweet!", "(http(s)?:\\/\\/.)?(www\\.|ftp:\\/\\/)?[-a-zA-Z0-9@:%._\\/+~#=]{2,256}\\.[a-z]{2,6}\\b([-a-zA-Z0-9@:%_\\/+.~#?&()=]*)" ) ).toEqual( false );
- } );
-} );
From 32c1ec6ed66e08da76a543f7dbd5608fa4d47851 Mon Sep 17 00:00:00 2001
From: Agnieszka Szuba
Date: Mon, 7 Nov 2022 17:39:12 +0100
Subject: [PATCH 07/14] Fix code style
---
.../ja/helpers/countCharactersSpec.js | 1 -
.../getProminentWordsForInternalLinking.js | 22 +++++++++++++------
2 files changed, 15 insertions(+), 8 deletions(-)
diff --git a/packages/yoastseo/spec/languageProcessing/languages/ja/helpers/countCharactersSpec.js b/packages/yoastseo/spec/languageProcessing/languages/ja/helpers/countCharactersSpec.js
index fb5bf8fbbc5..f154c85be5c 100644
--- a/packages/yoastseo/spec/languageProcessing/languages/ja/helpers/countCharactersSpec.js
+++ b/packages/yoastseo/spec/languageProcessing/languages/ja/helpers/countCharactersSpec.js
@@ -1,4 +1,3 @@
-import { doesWordMatchRegex } from "../../../../../src/languageProcessing/helpers/morphology/regexHelpers";
import countCharactersFunction from "../../../../../src/languageProcessing/languages/ja/helpers/countCharacters.js";
describe( "counts characters in a string", function() {
diff --git a/packages/yoastseo/src/languageProcessing/researches/getProminentWordsForInternalLinking.js b/packages/yoastseo/src/languageProcessing/researches/getProminentWordsForInternalLinking.js
index 66564bbe6fa..1894fc44ff7 100644
--- a/packages/yoastseo/src/languageProcessing/researches/getProminentWordsForInternalLinking.js
+++ b/packages/yoastseo/src/languageProcessing/researches/getProminentWordsForInternalLinking.js
@@ -13,6 +13,18 @@ import baseStemmer from "../helpers/morphology/baseStemmer";
import removeURLs from "../helpers/sanitize/removeURLs.js";
import removeEmailAddresses from "../helpers/sanitize/removeEmailAddresses";
+/**
+ * Removes URLs and email addresses from the text.
+ *
+ * @param {string} text The text to sanitize.
+ *
+ * @returns {string} The text without URLs and email addresses.
+ */
+const sanitizeText = function( text ) {
+ text = removeURLs( text );
+ return removeEmailAddresses( text );
+};
+
/**
* Retrieves the prominent words from the given paper.
*
@@ -34,13 +46,9 @@ function getProminentWordsForInternalLinking( paper, researcher ) {
// An optional custom helper to count length to use instead of countWords.
const customCountLength = researcher.getHelper( "customCountLength" );
- let text = paper.getText();
- // We don't want to include URLs or email addresses in prominent words.
- text = removeURLs( text );
- text = removeEmailAddresses( text );
-
- const metadescription = removeURLs( paper.getDescription() );
- const title = removeURLs( paper.getTitle() );
+ const text = sanitizeText( paper.getText() );
+ const metadescription = sanitizeText( paper.getDescription() );
+ const title = sanitizeText( paper.getTitle() );
const result = {};
result.hasMetaDescription = metadescription !== "";
From e1ee7760a4e26436d1fcf6ae3eadb4aa60df5e74 Mon Sep 17 00:00:00 2001
From: Agnieszka Szuba
Date: Tue, 8 Nov 2022 10:48:27 +0100
Subject: [PATCH 08/14] Add unit tests to
getProminentWordsForInternalLinkingSpec
---
...getProminentWordsForInternalLinkingSpec.js | 52 +++++++++++++++++++
1 file changed, 52 insertions(+)
diff --git a/packages/yoastseo/spec/languageProcessing/researches/getProminentWordsForInternalLinkingSpec.js b/packages/yoastseo/spec/languageProcessing/researches/getProminentWordsForInternalLinkingSpec.js
index 8bb8b2df8cf..0b72dc562bc 100644
--- a/packages/yoastseo/spec/languageProcessing/researches/getProminentWordsForInternalLinkingSpec.js
+++ b/packages/yoastseo/spec/languageProcessing/researches/getProminentWordsForInternalLinkingSpec.js
@@ -381,5 +381,57 @@ describe( "test for prominent words research for languages that have custom help
expect( words ).toEqual( expected );
} );
+
+ it( "does not count URLs and email addresses as prominent words", function() {
+ const paper = new Paper( "http://blog.example.com/examples".repeat( 180 ) + " example@something.com".repeat( 180 ),
+ { title: "example@something.com example@something.com example@something.com" } );
+
+ const researcher = new Researcher( paper );
+
+ const expected = {
+ prominentWords: [],
+ hasMetaDescription: false,
+ hasTitle: true,
+ };
+
+ const words = prominentWordsResearch( paper, researcher );
+
+ expect( words ).toEqual( expected );
+ } );
+
+ it( "does not return prominent words when the text is longer than 100 words including URLs and emails, but shorter" +
+ "than 100 words when they are excluded", function() {
+ const paper = new Paper( "http://blog.example.com/examples".repeat( 180 ) + " example@something.com".repeat( 180 ) +
+ " cats".repeat( 50 ), { title: "example@something.com example@something.com example@something.com" } );
+
+ const researcher = new Researcher( paper );
+
+ const expected = {
+ prominentWords: [],
+ hasMetaDescription: false,
+ hasTitle: true,
+ };
+
+ const words = prominentWordsResearch( paper, researcher );
+
+ expect( words ).toEqual( expected );
+ } );
+
+ it( "returns prominent words when the text is longer than 100 words after excluding URLs and emails", function() {
+ const paper = new Paper( "http://blog.example.com/examples".repeat( 180 ) + " example@something.com".repeat( 180 ) +
+ " cats".repeat( 101 ), { title: "example@something.com example@something.com example@something.com" } );
+
+ const researcher = new Researcher( paper );
+
+ const expected = {
+ prominentWords: [ new ProminentWord( "cats", "cats", 101 ) ],
+ hasMetaDescription: false,
+ hasTitle: true,
+ };
+
+ const words = prominentWordsResearch( paper, researcher );
+
+ expect( words ).toEqual( expected );
+ } );
} );
From 7011f0b816e567a057ffa005ce9acd720c8a544c Mon Sep 17 00:00:00 2001
From: Agnieszka Szuba
Date: Tue, 8 Nov 2022 12:35:57 +0100
Subject: [PATCH 09/14] Adds a semi-colon to the characters that can appear
after the top-level domain
This was added as a fix for the WP editor converting & to &amp;
---
.../languageProcessing/helpers/sanitize/removeURLsSpec.js | 3 +++
.../src/languageProcessing/helpers/sanitize/removeURLs.js | 4 ++--
2 files changed, 5 insertions(+), 2 deletions(-)
diff --git a/packages/yoastseo/spec/languageProcessing/helpers/sanitize/removeURLsSpec.js b/packages/yoastseo/spec/languageProcessing/helpers/sanitize/removeURLsSpec.js
index 2ec9429357a..4e084f79ef0 100644
--- a/packages/yoastseo/spec/languageProcessing/helpers/sanitize/removeURLsSpec.js
+++ b/packages/yoastseo/spec/languageProcessing/helpers/sanitize/removeURLsSpec.js
@@ -28,6 +28,9 @@ describe( "a test for removing URLs from a string", function() {
it( "removes a URL containing special characters.", function() {
expect( removeURLs( "https://www.example.com/foo/?bar=baz&inga=42&quux" ) ).toBe( "" );
} );
+ it( "removes a URL containing a semi-colon.", function() {
+ expect( removeURLs( "https://www.example.com/foo/?bar=baz&amp;inga=42&amp;quux" ) ).toBe( "" );
+ } );
it( "removes a URL containing more special characters.", function() {
expect( removeURLs( "http://foo.com/blah_(wikipedia)_blah#cite-1" ) ).toBe( "" );
} );
diff --git a/packages/yoastseo/src/languageProcessing/helpers/sanitize/removeURLs.js b/packages/yoastseo/src/languageProcessing/helpers/sanitize/removeURLs.js
index 727fbc0237f..c83b006a345 100644
--- a/packages/yoastseo/src/languageProcessing/helpers/sanitize/removeURLs.js
+++ b/packages/yoastseo/src/languageProcessing/helpers/sanitize/removeURLs.js
@@ -1,5 +1,5 @@
-const urlRegex = new RegExp( "(ftp|http(s)?:\\/\\/.)(www\\\\.)?[-a-zA-Z0-9@:%._\\/+~#=]{2,256}\\.[a-z]{2,6}\\b([-a-zA-Z0-9@:%_\\/+.~#?&()=]*)" +
- "|www\\.[-a-zA-Z0-9@:%._\\/+~#=]{2,256}\\.[a-z]{2,6}\\b([-a-zA-Z0-9@:%_\\/+.~#?&()=]*)", "igm" );
+const urlRegex = new RegExp( "(ftp|http(s)?:\\/\\/.)(www\\\\.)?[-a-zA-Z0-9@:%._\\/+~#=]{2,256}\\.[a-z]{2,6}\\b([-a-zA-Z0-9@:;%_\\/+.~#?&()=]*)" +
+ "|www\\.[-a-zA-Z0-9@:%._\\/+~#=]{2,256}\\.[a-z]{2,6}\\b([-a-zA-Z0-9@:;%_\\/+.~#?&()=]*)", "igm" );
/**
* Removes URLs from a text.
*
From e60b3c7d92ccdedd9482f9303d9fd6a1b0642e95 Mon Sep 17 00:00:00 2001
From: Agnieszka Szuba
Date: Tue, 8 Nov 2022 16:56:22 +0100
Subject: [PATCH 10/14] Add unit tests for prominent words for insights
research
---
.../getProminentWordsForInsightsSpec.js | 39 +++++++++++++++++++
1 file changed, 39 insertions(+)
diff --git a/packages/yoastseo/spec/languageProcessing/researches/getProminentWordsForInsightsSpec.js b/packages/yoastseo/spec/languageProcessing/researches/getProminentWordsForInsightsSpec.js
index 16e1b189079..646885a69a5 100644
--- a/packages/yoastseo/spec/languageProcessing/researches/getProminentWordsForInsightsSpec.js
+++ b/packages/yoastseo/spec/languageProcessing/researches/getProminentWordsForInsightsSpec.js
@@ -175,3 +175,42 @@ describe( "test for prominent words research for languages that have custom help
expect( words ).toEqual( expected );
} );
} );
+
+describe( "test for filtering out of URLs and email addresses", function() {
+ it( "does not include URLs in prominent words", function() {
+ const paper = new Paper( "http://blog.example.com/examples ".repeat( 180 ) + "cats ".repeat( 50 ) );
+
+ const researcher = new Researcher( paper );
+ researcher.addResearchData( "morphology", morphologyData );
+
+ const words = getProminentWordsForInsights( paper, researcher );
+ expect( words ).toEqual( [
+ new ProminentWord( "cats", "cat", 50 ),
+ ] );
+ } );
+
+ it( "does not include email addresses in prominent words", function() {
+ const paper = new Paper( "example89@something.com ".repeat( 180 ) + "cats ".repeat( 50 ) );
+
+ const researcher = new Researcher( paper );
+ researcher.addResearchData( "morphology", morphologyData );
+
+ const words = getProminentWordsForInsights( paper, researcher );
+ expect( words ).toEqual( [
+ new ProminentWord( "cats", "cat", 50 ),
+ ] );
+ } );
+
+ it( "includes domain names in prominent words", function() {
+ const paper = new Paper( "example.com ".repeat( 180 ) + "cats ".repeat( 50 ) );
+
+ const researcher = new Researcher( paper );
+ researcher.addResearchData( "morphology", morphologyData );
+
+ const words = getProminentWordsForInsights( paper, researcher );
+ expect( words ).toEqual( [
+ new ProminentWord( "example.com", "example.com", 180 ),
+ new ProminentWord( "cats", "cat", 50 ),
+ ] );
+ } );
+} );
From 1e2a7e6938978b0a48f82f1edc750a5dc3a0f9ef Mon Sep 17 00:00:00 2001
From: Agnieszka Szuba
Date: Tue, 8 Nov 2022 16:56:43 +0100
Subject: [PATCH 11/14] Adjust prominent words for internal linking research
unit tests
---
...getProminentWordsForInternalLinkingSpec.js | 22 ++++++++++++++++---
1 file changed, 19 insertions(+), 3 deletions(-)
diff --git a/packages/yoastseo/spec/languageProcessing/researches/getProminentWordsForInternalLinkingSpec.js b/packages/yoastseo/spec/languageProcessing/researches/getProminentWordsForInternalLinkingSpec.js
index 0b72dc562bc..fe232aaa901 100644
--- a/packages/yoastseo/spec/languageProcessing/researches/getProminentWordsForInternalLinkingSpec.js
+++ b/packages/yoastseo/spec/languageProcessing/researches/getProminentWordsForInternalLinkingSpec.js
@@ -383,7 +383,7 @@ describe( "test for prominent words research for languages that have custom help
} );
it( "does not count URLs and email addresses as prominent words", function() {
- const paper = new Paper( "http://blog.example.com/examples".repeat( 180 ) + " example@something.com".repeat( 180 ),
+ const paper = new Paper( "http://blog.example.com/examples ".repeat( 180 ) + "example@something.com ".repeat( 180 ),
{ title: "example@something.com example@something.com example@something.com" } );
const researcher = new Researcher( paper );
@@ -399,9 +399,25 @@ describe( "test for prominent words research for languages that have custom help
expect( words ).toEqual( expected );
} );
+ it( "counts domain names as prominent words", function() {
+ const paper = new Paper( "yoast.com ".repeat( 180 ) );
+
+ const researcher = new Researcher( paper );
+
+ const expected = {
+ prominentWords: [ new ProminentWord( "yoast.com", "yoast.com", 180 ) ],
+ hasMetaDescription: false,
+ hasTitle: false,
+ };
+
+ const words = prominentWordsResearch( paper, researcher );
+
+ expect( words ).toEqual( expected );
+ } );
+
it( "does not return prominent words when the text is longer than 100 words including URLs and emails, but shorter" +
"than 100 words when they are excluded", function() {
- const paper = new Paper( "http://blog.example.com/examples".repeat( 180 ) + " example@something.com".repeat( 180 ) +
+ const paper = new Paper( "http://blog.example.com/examples ".repeat( 180 ) + "example@something.com ".repeat( 180 ) +
" cats".repeat( 50 ), { title: "example@something.com example@something.com example@something.com" } );
const researcher = new Researcher( paper );
@@ -418,7 +434,7 @@ describe( "test for prominent words research for languages that have custom help
} );
it( "returns prominent words when the text is longer than 100 words after excluding URLs and emails", function() {
- const paper = new Paper( "http://blog.example.com/examples".repeat( 180 ) + " example@something.com".repeat( 180 ) +
+ const paper = new Paper( "http://blog.example.com/examples ".repeat( 180 ) + "example@something.com ".repeat( 180 ) +
" cats".repeat( 101 ), { title: "example@something.com example@something.com example@something.com" } );
const researcher = new Researcher( paper );
From 43811c8f1a975736a668c7378cc044eb9f43baab Mon Sep 17 00:00:00 2001
From: Marina Koleva
Date: Thu, 10 Nov 2022 13:52:04 +0100
Subject: [PATCH 12/14] adding a comment to removeURLs.js
---
.../researches/getProminentWordsForInsightsSpec.js | 2 +-
.../src/languageProcessing/helpers/sanitize/removeURLs.js | 3 ++-
2 files changed, 3 insertions(+), 2 deletions(-)
diff --git a/packages/yoastseo/spec/languageProcessing/researches/getProminentWordsForInsightsSpec.js b/packages/yoastseo/spec/languageProcessing/researches/getProminentWordsForInsightsSpec.js
index 646885a69a5..19a9b89048a 100644
--- a/packages/yoastseo/spec/languageProcessing/researches/getProminentWordsForInsightsSpec.js
+++ b/packages/yoastseo/spec/languageProcessing/researches/getProminentWordsForInsightsSpec.js
@@ -176,7 +176,7 @@ describe( "test for prominent words research for languages that have custom help
} );
} );
-describe( "test for filtering out of URLs and email addresses", function() {
+describe( "test for filtering out URLs and email addresses", function() {
it( "does not include URLs in prominent words", function() {
const paper = new Paper( "http://blog.example.com/examples ".repeat( 180 ) + "cats ".repeat( 50 ) );
diff --git a/packages/yoastseo/src/languageProcessing/helpers/sanitize/removeURLs.js b/packages/yoastseo/src/languageProcessing/helpers/sanitize/removeURLs.js
index c83b006a345..309e222ab74 100644
--- a/packages/yoastseo/src/languageProcessing/helpers/sanitize/removeURLs.js
+++ b/packages/yoastseo/src/languageProcessing/helpers/sanitize/removeURLs.js
@@ -1,7 +1,8 @@
const urlRegex = new RegExp( "(ftp|http(s)?:\\/\\/.)(www\\\\.)?[-a-zA-Z0-9@:%._\\/+~#=]{2,256}\\.[a-z]{2,6}\\b([-a-zA-Z0-9@:;%_\\/+.~#?&()=]*)" +
"|www\\.[-a-zA-Z0-9@:%._\\/+~#=]{2,256}\\.[a-z]{2,6}\\b([-a-zA-Z0-9@:;%_\\/+.~#?&()=]*)", "igm" );
/**
- * Removes URLs from a text.
+ * Removes URLs from a text, whether they are embedded in tags or not.
+ * Domain names are not removed (e.g. "yoast.com" in "We got so much traffic on yoast.com after the latest release").
*
* @param {string} text The text to remove URLs from.
*
From cc3179c6ef0f345a8dd31ec61ad5d8f71e5516e2 Mon Sep 17 00:00:00 2001
From: Agnieszka Szuba
Date: Tue, 29 Nov 2022 16:38:25 +0100
Subject: [PATCH 13/14] Remove spaces from Japanese text before counting
characters
---
.../languages/ja/helpers/countCharacters.js | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/packages/yoastseo/src/languageProcessing/languages/ja/helpers/countCharacters.js b/packages/yoastseo/src/languageProcessing/languages/ja/helpers/countCharacters.js
index 782d58371dd..5c2cec887af 100644
--- a/packages/yoastseo/src/languageProcessing/languages/ja/helpers/countCharacters.js
+++ b/packages/yoastseo/src/languageProcessing/languages/ja/helpers/countCharacters.js
@@ -3,7 +3,9 @@ const { sanitizeString } = languageProcessing;
import removeURLs from "../../../helpers/sanitize/removeURLs.js";
/**
- * Calculates the character count of a text, including punctuation and numbers. Is used to determine length of text.
+ * Calculates the character count which serves as a measure of text length.
+ * The character count includes letters, punctuation, and numbers. It doesn't include URLs, HTML tags, whitespace, or the
+ * content of the Table of Contents and Estimated Reading Time blocks.
*
* @param {string} text The text to be counted.
*
@@ -12,6 +14,7 @@ import removeURLs from "../../../helpers/sanitize/removeURLs.js";
export default function( text ) {
text = removeURLs( text );
text = sanitizeString( text );
+ text = text.replace( /\s/g, "" );
return text.length;
}
From f940c11051b6a9892fb52f6666aa113a034df603 Mon Sep 17 00:00:00 2001
From: Agnieszka Szuba
Date: Tue, 29 Nov 2022 17:24:55 +0100
Subject: [PATCH 14/14] Fix unit tests
---
.../spec/fullTextTests/testTexts/ja/japanesePaper.js | 2 +-
.../helpers/sentence/sentencesLengthSpec.js | 2 +-
.../languages/ja/helpers/countCharactersSpec.js | 6 +++---
3 files changed, 5 insertions(+), 5 deletions(-)
diff --git a/packages/yoastseo/spec/fullTextTests/testTexts/ja/japanesePaper.js b/packages/yoastseo/spec/fullTextTests/testTexts/ja/japanesePaper.js
index 915451bc5df..39d9f79129a 100644
--- a/packages/yoastseo/spec/fullTextTests/testTexts/ja/japanesePaper.js
+++ b/packages/yoastseo/spec/fullTextTests/testTexts/ja/japanesePaper.js
@@ -66,7 +66,7 @@ const expectedResults = {
textLength: {
isApplicable: true,
score: 9,
- resultText: "Text length: The text contains 3165 characters. Good job!",
+ resultText: "Text length: The text contains 3022 characters. Good job!",
},
externalLinks: {
isApplicable: true,
diff --git a/packages/yoastseo/spec/languageProcessing/helpers/sentence/sentencesLengthSpec.js b/packages/yoastseo/spec/languageProcessing/helpers/sentence/sentencesLengthSpec.js
index 5a83c81d57c..0b50b8e7b7b 100644
--- a/packages/yoastseo/spec/languageProcessing/helpers/sentence/sentencesLengthSpec.js
+++ b/packages/yoastseo/spec/languageProcessing/helpers/sentence/sentencesLengthSpec.js
@@ -35,7 +35,7 @@ describe( "A test to count sentence lengths.", function() {
expect( lengths ).toEqual( [
{ sentence: "自然おのずから存在しているもの", sentenceLength: 15 },
- { sentence: "歩くさわやかな森 自然 ", sentenceLength: 11 },
+ { sentence: "歩くさわやかな森 自然 ", sentenceLength: 10 },
] );
} );
} );
diff --git a/packages/yoastseo/spec/languageProcessing/languages/ja/helpers/countCharactersSpec.js b/packages/yoastseo/spec/languageProcessing/languages/ja/helpers/countCharactersSpec.js
index f154c85be5c..a601da67476 100644
--- a/packages/yoastseo/spec/languageProcessing/languages/ja/helpers/countCharactersSpec.js
+++ b/packages/yoastseo/spec/languageProcessing/languages/ja/helpers/countCharactersSpec.js
@@ -12,9 +12,9 @@ describe( "counts characters in a string", function() {
"東京オリンピック開会直前の1964年(昭和39年)10月1日に開業した。" ) ).toBe( 136 );
} );
it( "makes sure the countCharacters function still works when the input is a non-Japanese string", function() {
- expect( countCharactersFunction( "this is a string" ) ).toBe( 16 );
+ expect( countCharactersFunction( "this is a string" ) ).toBe( 13 );
expect( countCharactersFunction( "Низът в компютърните науки е крайна поредица от символи " +
- "(представляващи краен брой знаци)." ) ).toBe( 90 );
+ "(представляващи краен брой знаци)." ) ).toBe( 78 );
} );
it( "makes sure that the table of contents is excluded from the calculation", function() {
const text = "
目次
戦後においては一般に広義の童謡にカテゴライズされる本作品は、" +
"初出本の題名にもあるとおり青木自身は「唱歌」であるとし、「学校や家庭で」歌ってもらえれば本懐であるとしている。しかし発表当時の教育現場では、" +
"本作品を歌うことは原則上はできなかった。";
- expect( countCharactersFunction( text ) ).toBe( 757 );
+ expect( countCharactersFunction( text ) ).toBe( 744 );
} );
it( "makes sure that no characters are counted when a URL is embedded in video tags", function() {
const text = "