From 31262ae8d9fbbc73d5930b8f5021e32a2bcd47a3 Mon Sep 17 00:00:00 2001 From: Agnieszka Szuba Date: Thu, 3 Nov 2022 17:32:43 +0100 Subject: [PATCH 01/14] Exclude links from prominent words --- .../helpers/sanitize/removeURLs.js | 13 +++++++++++++ .../languages/ja/helpers/countCharacters.js | 6 ++---- .../researches/getProminentWordsForInsights.js | 3 ++- .../getProminentWordsForInternalLinking.js | 7 ++++--- 4 files changed, 21 insertions(+), 8 deletions(-) create mode 100644 packages/yoastseo/src/languageProcessing/helpers/sanitize/removeURLs.js diff --git a/packages/yoastseo/src/languageProcessing/helpers/sanitize/removeURLs.js b/packages/yoastseo/src/languageProcessing/helpers/sanitize/removeURLs.js new file mode 100644 index 00000000000..61315352a06 --- /dev/null +++ b/packages/yoastseo/src/languageProcessing/helpers/sanitize/removeURLs.js @@ -0,0 +1,13 @@ +const urlRegex = new RegExp( "(http(s)?:\\/\\/.)?(www\\.|ftp:\\/\\/)?[-a-zA-Z0-9@:%._\\/+~#=]{2,256}\\.[a-z]{2,6}\\b" + + "([-a-zA-Z0-9@:%_\\/+.~#?&()=]*)", "igm" ); + +/** + * Removes URLs from a text. + * + * @param {string} text The text to remove URLs from. + * + * @returns {string} The text without URLs. + */ +export default function( text ) { + return text.replace( urlRegex, "" ); +} diff --git a/packages/yoastseo/src/languageProcessing/languages/ja/helpers/countCharacters.js b/packages/yoastseo/src/languageProcessing/languages/ja/helpers/countCharacters.js index b18ce6d84e6..782d58371dd 100644 --- a/packages/yoastseo/src/languageProcessing/languages/ja/helpers/countCharacters.js +++ b/packages/yoastseo/src/languageProcessing/languages/ja/helpers/countCharacters.js @@ -1,5 +1,6 @@ import { languageProcessing } from "yoastseo"; const { sanitizeString } = languageProcessing; +import removeURLs from "../../../helpers/sanitize/removeURLs.js"; /** * Calculates the character count of a text, including punctuation and numbers. Is used to determine length of text. 
@@ -9,10 +10,7 @@ const { sanitizeString } = languageProcessing; * @returns {number} The character count of the given text. */ export default function( text ) { - // This regex is used to match URLs in the text, either embedded in tags or not, so that they are excluded from the characters count. - // eslint-disable-next-line max-len - const urlRegex = new RegExp( "(http(s)?:\\/\\/.)?(www\\.|ftp:\\/\\/)?[-a-zA-Z0-9@:%._\\/+~#=]{2,256}\\.[a-z]{2,6}\\b([-a-zA-Z0-9@:%_\\/+.~#?&()=]*)", "igm" ); - text = text.replace( urlRegex, "" ); + text = removeURLs( text ); text = sanitizeString( text ); return text.length; diff --git a/packages/yoastseo/src/languageProcessing/researches/getProminentWordsForInsights.js b/packages/yoastseo/src/languageProcessing/researches/getProminentWordsForInsights.js index 7e994d648e9..7ba268d0076 100644 --- a/packages/yoastseo/src/languageProcessing/researches/getProminentWordsForInsights.js +++ b/packages/yoastseo/src/languageProcessing/researches/getProminentWordsForInsights.js @@ -6,6 +6,7 @@ import { retrieveAbbreviations, sortProminentWords, } from "../helpers/prominentWords/determineProminentWords"; +import removeURLs from "../helpers/sanitize/removeURLs.js"; /** * Retrieves the prominent words from the given paper. @@ -23,7 +24,7 @@ function getProminentWordsForInsights( paper, researcher ) { // An optional custom helper to get words from the text. const getWordsCustomHelper = researcher.getHelper( "getWordsCustomHelper" ); - const text = paper.getText(); + const text = removeURLs( paper.getText() ); // If the language has a custom helper to get words from the text, we don't retrieve the abbreviation. const abbreviations = getWordsCustomHelper ? 
[] : retrieveAbbreviations( text ); diff --git a/packages/yoastseo/src/languageProcessing/researches/getProminentWordsForInternalLinking.js b/packages/yoastseo/src/languageProcessing/researches/getProminentWordsForInternalLinking.js index 46faefc1b4e..560664a4743 100644 --- a/packages/yoastseo/src/languageProcessing/researches/getProminentWordsForInternalLinking.js +++ b/packages/yoastseo/src/languageProcessing/researches/getProminentWordsForInternalLinking.js @@ -10,6 +10,7 @@ import { } from "../helpers/prominentWords/determineProminentWords"; import { getSubheadingsTopLevel, removeSubheadingsTopLevel } from "../helpers/html/getSubheadings"; import baseStemmer from "../helpers/morphology/baseStemmer"; +import removeURLs from "../helpers/sanitize/removeURLs.js"; /** * Retrieves the prominent words from the given paper. @@ -32,9 +33,9 @@ function getProminentWordsForInternalLinking( paper, researcher ) { // An optional custom helper to count length to use instead of countWords. const customCountLength = researcher.getHelper( "customCountLength" ); - const text = paper.getText(); - const metadescription = paper.getDescription(); - const title = paper.getTitle(); + const text = removeURLs( paper.getText() ); + const metadescription = removeURLs( paper.getDescription() ); + const title = removeURLs( paper.getTitle() ); const result = {}; result.hasMetaDescription = metadescription !== ""; From a331571f9d45a87ffb09e3406289edc9c72cd4dd Mon Sep 17 00:00:00 2001 From: Agnieszka Szuba Date: Mon, 7 Nov 2022 13:57:23 +0100 Subject: [PATCH 02/14] Add unit tests --- .../sanitize/removeEmailAddressesSpec.js | 0 .../helpers/sanitize/removeURLsSpec.js | 34 +++++++++++++++++++ 2 files changed, 34 insertions(+) create mode 100644 packages/yoastseo/spec/languageProcessing/helpers/sanitize/removeEmailAddressesSpec.js create mode 100644 packages/yoastseo/spec/languageProcessing/helpers/sanitize/removeURLsSpec.js diff --git 
a/packages/yoastseo/spec/languageProcessing/helpers/sanitize/removeEmailAddressesSpec.js b/packages/yoastseo/spec/languageProcessing/helpers/sanitize/removeEmailAddressesSpec.js new file mode 100644 index 00000000000..e69de29bb2d diff --git a/packages/yoastseo/spec/languageProcessing/helpers/sanitize/removeURLsSpec.js b/packages/yoastseo/spec/languageProcessing/helpers/sanitize/removeURLsSpec.js new file mode 100644 index 00000000000..e69b6eace70 --- /dev/null +++ b/packages/yoastseo/spec/languageProcessing/helpers/sanitize/removeURLsSpec.js @@ -0,0 +1,34 @@ +import removeURLs from "../../../../src/languageProcessing/helpers/sanitize/removeURLs.js"; + +describe( "a test for removing URLs from a string", function() { + it( "removes a base URL", function() { + expect( removeURLs( "https://example.com" ) ).toBe( "" ); + } ); + it( "removes a URL followed by a subdirectory", function() { + expect( removeURLs( "https://example.com/example1" ) ).toBe( "" ); + } ); + it( "removes a URL followed by multiple subdirectories", function() { + expect( removeURLs( "https://example.com/example1/part1" ) ).toBe( "" ); + } ); + it( "removes a URL with a subdomain", function() { + expect( removeURLs( "https://blog.example.com/examples" ) ).toBe( "" ); + } ); + it( "removes a URL starting with http://", function() { + expect( removeURLs( "http://blog.example.com/examples" ) ).toBe( "" ); + } ); + it( "removes a URL containing www.", function() { + expect( removeURLs( "http://www.blog.example.com/examples" ) ).toBe( "" ); + } ); + it( "removes a URL containing special characters.", function() { + expect( removeURLs( "https://www.example.com/foo/?bar=baz&inga=42&quux" ) ).toBe( "" ); + } ); + it( "removes a URL containing more special characters.", function() { + expect( removeURLs( "http://foo.com/blah_(wikipedia)_blah#cite-1" ) ).toBe( "" ); + } ); + it( "removes a URL with a different top-level domain", function() { + expect( removeURLs( "http://example.co.uk" ) ).toBe( "" ); + } ); 
+ it( "does not remove a string if it doesn't start with http(s)://", function() { + expect( removeURLs( "example.com" ) ).toBe( "blog.example.com/examples" ); + } ); +} ); From 8c9d663c68dfc1fee8846e2e6f1f218d6922dc60 Mon Sep 17 00:00:00 2001 From: Agnieszka Szuba Date: Mon, 7 Nov 2022 13:57:33 +0100 Subject: [PATCH 03/14] Add helper for removing email addresses --- .../helpers/sanitize/removeEmailAddresses.js | 12 ++++++++++++ 1 file changed, 12 insertions(+) create mode 100644 packages/yoastseo/src/languageProcessing/helpers/sanitize/removeEmailAddresses.js diff --git a/packages/yoastseo/src/languageProcessing/helpers/sanitize/removeEmailAddresses.js b/packages/yoastseo/src/languageProcessing/helpers/sanitize/removeEmailAddresses.js new file mode 100644 index 00000000000..48e2c76029d --- /dev/null +++ b/packages/yoastseo/src/languageProcessing/helpers/sanitize/removeEmailAddresses.js @@ -0,0 +1,12 @@ +const emailRegex = new RegExp( "[^\\s@]+@[^\\s@]+\\.[^\\s@]+", "igm" ); + +/** + * Removes email addresses from a text. + * + * @param {string} text The text to remove emails from. + * + * @returns {string} The text without email addresses. 
+ */ +export default function( text ) { + return text.replace( emailRegex, "" ); +} From 89957a081d06a9e9c359475b9797a77e63c5923e Mon Sep 17 00:00:00 2001 From: Agnieszka Szuba Date: Mon, 7 Nov 2022 13:58:14 +0100 Subject: [PATCH 04/14] Remove email addresses from the text before retrieving the prominent words --- .../researches/getProminentWordsForInsights.js | 6 +++++- .../researches/getProminentWordsForInternalLinking.js | 7 ++++++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/packages/yoastseo/src/languageProcessing/researches/getProminentWordsForInsights.js b/packages/yoastseo/src/languageProcessing/researches/getProminentWordsForInsights.js index 7ba268d0076..0079fb5442b 100644 --- a/packages/yoastseo/src/languageProcessing/researches/getProminentWordsForInsights.js +++ b/packages/yoastseo/src/languageProcessing/researches/getProminentWordsForInsights.js @@ -7,6 +7,7 @@ import { sortProminentWords, } from "../helpers/prominentWords/determineProminentWords"; import removeURLs from "../helpers/sanitize/removeURLs.js"; +import removeEmailAddresses from "../helpers/sanitize/removeEmailAddresses"; /** * Retrieves the prominent words from the given paper. @@ -24,7 +25,10 @@ function getProminentWordsForInsights( paper, researcher ) { // An optional custom helper to get words from the text. const getWordsCustomHelper = researcher.getHelper( "getWordsCustomHelper" ); - const text = removeURLs( paper.getText() ); + let text = paper.getText(); + // We don't want to include URLs or email addresses in prominent words. + text = removeURLs( text ); + text = removeEmailAddresses( text ); // If the language has a custom helper to get words from the text, we don't retrieve the abbreviation. const abbreviations = getWordsCustomHelper ? 
[] : retrieveAbbreviations( text ); diff --git a/packages/yoastseo/src/languageProcessing/researches/getProminentWordsForInternalLinking.js b/packages/yoastseo/src/languageProcessing/researches/getProminentWordsForInternalLinking.js index 560664a4743..66564bbe6fa 100644 --- a/packages/yoastseo/src/languageProcessing/researches/getProminentWordsForInternalLinking.js +++ b/packages/yoastseo/src/languageProcessing/researches/getProminentWordsForInternalLinking.js @@ -11,6 +11,7 @@ import { import { getSubheadingsTopLevel, removeSubheadingsTopLevel } from "../helpers/html/getSubheadings"; import baseStemmer from "../helpers/morphology/baseStemmer"; import removeURLs from "../helpers/sanitize/removeURLs.js"; +import removeEmailAddresses from "../helpers/sanitize/removeEmailAddresses"; /** * Retrieves the prominent words from the given paper. @@ -33,7 +34,11 @@ function getProminentWordsForInternalLinking( paper, researcher ) { // An optional custom helper to count length to use instead of countWords. const customCountLength = researcher.getHelper( "customCountLength" ); - const text = removeURLs( paper.getText() ); + let text = paper.getText(); + // We don't want to include URLs or email addresses in prominent words. 
+ text = removeURLs( text ); + text = removeEmailAddresses( text ); + const metadescription = removeURLs( paper.getDescription() ); const title = removeURLs( paper.getTitle() ); From d4ff1e7794c73f712fc929e378d5efb176eb0fad Mon Sep 17 00:00:00 2001 From: Agnieszka Szuba Date: Mon, 7 Nov 2022 17:03:32 +0100 Subject: [PATCH 05/14] Change the regex so that it doesn't match URLs that don't start with http(s), ftp, or www For example 'yoast.com' - this can be used as a word, not necessarily a link, like "There was so much traffic on yoast.com today" --- .../src/languageProcessing/helpers/sanitize/removeURLs.js | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/packages/yoastseo/src/languageProcessing/helpers/sanitize/removeURLs.js b/packages/yoastseo/src/languageProcessing/helpers/sanitize/removeURLs.js index 61315352a06..727fbc0237f 100644 --- a/packages/yoastseo/src/languageProcessing/helpers/sanitize/removeURLs.js +++ b/packages/yoastseo/src/languageProcessing/helpers/sanitize/removeURLs.js @@ -1,6 +1,5 @@ -const urlRegex = new RegExp( "(http(s)?:\\/\\/.)?(www\\.|ftp:\\/\\/)?[-a-zA-Z0-9@:%._\\/+~#=]{2,256}\\.[a-z]{2,6}\\b" + - "([-a-zA-Z0-9@:%_\\/+.~#?&()=]*)", "igm" ); - +const urlRegex = new RegExp( "(ftp|http(s)?:\\/\\/.)(www\\\\.)?[-a-zA-Z0-9@:%._\\/+~#=]{2,256}\\.[a-z]{2,6}\\b([-a-zA-Z0-9@:%_\\/+.~#?&()=]*)" + + "|www\\.[-a-zA-Z0-9@:%._\\/+~#=]{2,256}\\.[a-z]{2,6}\\b([-a-zA-Z0-9@:%_\\/+.~#?&()=]*)", "igm" ); /** * Removes URLs from a text. 
* From 63df888afa8c6cb40b8f604b5ac0ce4096aaf0fb Mon Sep 17 00:00:00 2001 From: Agnieszka Szuba Date: Mon, 7 Nov 2022 17:03:48 +0100 Subject: [PATCH 06/14] Add more unit tests --- .../sanitize/removeEmailAddressesSpec.js | 19 +++++++++++++++++ .../helpers/sanitize/removeURLsSpec.js | 19 +++++++++++++++-- .../ja/helpers/countCharactersSpec.js | 21 +------------------ 3 files changed, 37 insertions(+), 22 deletions(-) diff --git a/packages/yoastseo/spec/languageProcessing/helpers/sanitize/removeEmailAddressesSpec.js b/packages/yoastseo/spec/languageProcessing/helpers/sanitize/removeEmailAddressesSpec.js index e69de29bb2d..31178173848 100644 --- a/packages/yoastseo/spec/languageProcessing/helpers/sanitize/removeEmailAddressesSpec.js +++ b/packages/yoastseo/spec/languageProcessing/helpers/sanitize/removeEmailAddressesSpec.js @@ -0,0 +1,19 @@ +import removeEmailAddresses from "../../../../src/languageProcessing/helpers/sanitize/removeEmailAddresses.js"; + +describe( "a test for removing email addresses from a string", function() { + it( "removes an email address", function() { + expect( removeEmailAddresses( "example@something.com" ) ).toBe( "" ); + } ); + it( "removes an email address with special characters", function() { + expect( removeEmailAddresses( "some+long+email+address23@some+host-weird-/looking.com" ) ).toBe( "" ); + } ); + it( "removes a very short email address", function() { + expect( removeEmailAddresses( "a@b.com" ) ).toBe( "" ); + } ); + it( "does not remove invalid email addresses", function() { + expect( removeEmailAddresses( "@b.com" ) ).toBe( "@b.com" ); + expect( removeEmailAddresses( "a@b" ) ).toBe( "a@b" ); + expect( removeEmailAddresses( "example@" ) ).toBe( "example@" ); + expect( removeEmailAddresses( "example.com" ) ).toBe( "example.com" ); + } ); +} ); diff --git a/packages/yoastseo/spec/languageProcessing/helpers/sanitize/removeURLsSpec.js b/packages/yoastseo/spec/languageProcessing/helpers/sanitize/removeURLsSpec.js index 
e69b6eace70..2ec9429357a 100644 --- a/packages/yoastseo/spec/languageProcessing/helpers/sanitize/removeURLsSpec.js +++ b/packages/yoastseo/spec/languageProcessing/helpers/sanitize/removeURLsSpec.js @@ -19,6 +19,12 @@ describe( "a test for removing URLs from a string", function() { it( "removes a URL containing www.", function() { expect( removeURLs( "http://www.blog.example.com/examples" ) ).toBe( "" ); } ); + it( "removes a URL starting with www.", function() { + expect( removeURLs( "www.blog.example.com/examples" ) ).toBe( "" ); + } ); + it( "removes a URL starting with ftp", function() { + expect( removeURLs( "ftp://example.com" ) ).toBe( "" ); + } ); it( "removes a URL containing special characters.", function() { expect( removeURLs( "https://www.example.com/foo/?bar=baz&inga=42&quux" ) ).toBe( "" ); } ); @@ -28,7 +34,16 @@ describe( "a test for removing URLs from a string", function() { it( "removes a URL with a different top-level domain", function() { expect( removeURLs( "http://example.co.uk" ) ).toBe( "" ); } ); - it( "does not remove a string if it doesn't start with http(s)://", function() { - expect( removeURLs( "example.com" ) ).toBe( "blog.example.com/examples" ); + it( "removes a URL followed by Japanese characters", function() { + expect( removeURLs( "https://example.comこれに対し日本国有鉄道" ) ).toBe( "これに対し日本国有鉄道" ); + } ); + it( "does not remove a URL that doesn't start with 'http(s)://', 'ftp://' or 'www'.", function() { + expect( removeURLs( "example.com" ) ).toBe( "example.com" ); + } ); + it( "does not remove https:// on its own", function() { + expect( removeURLs( "https://" ) ).toBe( "https://" ); + } ); + it( "does not remove a URL without a top-level domain", function() { + expect( removeURLs( "https://example" ) ).toBe( "https://example" ); } ); } ); diff --git a/packages/yoastseo/spec/languageProcessing/languages/ja/helpers/countCharactersSpec.js b/packages/yoastseo/spec/languageProcessing/languages/ja/helpers/countCharactersSpec.js index 
69cc0a18e6e..fb5bf8fbbc5 100644 --- a/packages/yoastseo/spec/languageProcessing/languages/ja/helpers/countCharactersSpec.js +++ b/packages/yoastseo/spec/languageProcessing/languages/ja/helpers/countCharactersSpec.js @@ -38,7 +38,7 @@ describe( "counts characters in a string", function() { "本作品を歌うことは原則上はできなかった。

"; expect( countCharactersFunction( text ) ).toBe( 757 ); } ); - it( "makes sure that no charachters are counted when a URL is embedded in video tags", function() { + it( "makes sure that no characters are counted when a URL is embedded in video tags", function() { const text = "\n" + "\t\t\t
{ - it( "Returns true if there is a match against a URL starting with www", () => { - // eslint-disable-next-line max-len - expect( doesWordMatchRegex( "www.yoast.com", "(http(s)?:\\/\\/.)?(www\\.|ftp:\\/\\/)?[-a-zA-Z0-9@:%._\\/+~#=]{2,256}\\.[a-z]{2,6}\\b([-a-zA-Z0-9@:%_\\/+.~#?&()=]*)" ) ).toEqual( true ); - } ); - it( "Returns true if there is a match against a URL starting with https", () => { - // eslint-disable-next-line max-len - expect( doesWordMatchRegex( "https://www.codecademy.com/learn/hello", "(http(s)?:\\/\\/.)?(www\\.|ftp:\\/\\/)?[-a-zA-Z0-9@:%._\\/+~#=]{2,256}\\.[a-z]{2,6}\\b([-a-zA-Z0-9@:%_\\/+.~#?&()=]*)" ) ).toEqual( true ); - } ); - it( "Returns true if there is a match against a URL starting with http", () => { - // eslint-disable-next-line max-len - expect( doesWordMatchRegex( "http://foo.com/blah_blah/", "(http(s)?:\\/\\/.)?(www\\.|ftp:\\/\\/)?[-a-zA-Z0-9@:%._\\/+~#=]{2,256}\\.[a-z]{2,6}\\b([-a-zA-Z0-9@:%_\\/+.~#?&()=]*)" ) ).toEqual( true ); - } ); - it( "Returns false if there is no match", () => { - // eslint-disable-next-line max-len - expect( doesWordMatchRegex( "My cat is sweet!", "(http(s)?:\\/\\/.)?(www\\.|ftp:\\/\\/)?[-a-zA-Z0-9@:%._\\/+~#=]{2,256}\\.[a-z]{2,6}\\b([-a-zA-Z0-9@:%_\\/+.~#?&()=]*)" ) ).toEqual( false ); - } ); -} ); From 32c1ec6ed66e08da76a543f7dbd5608fa4d47851 Mon Sep 17 00:00:00 2001 From: Agnieszka Szuba Date: Mon, 7 Nov 2022 17:39:12 +0100 Subject: [PATCH 07/14] Fix code style --- .../ja/helpers/countCharactersSpec.js | 1 - .../getProminentWordsForInternalLinking.js | 22 +++++++++++++------ 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/packages/yoastseo/spec/languageProcessing/languages/ja/helpers/countCharactersSpec.js b/packages/yoastseo/spec/languageProcessing/languages/ja/helpers/countCharactersSpec.js index fb5bf8fbbc5..f154c85be5c 100644 --- a/packages/yoastseo/spec/languageProcessing/languages/ja/helpers/countCharactersSpec.js +++ 
b/packages/yoastseo/spec/languageProcessing/languages/ja/helpers/countCharactersSpec.js @@ -1,4 +1,3 @@ -import { doesWordMatchRegex } from "../../../../../src/languageProcessing/helpers/morphology/regexHelpers"; import countCharactersFunction from "../../../../../src/languageProcessing/languages/ja/helpers/countCharacters.js"; describe( "counts characters in a string", function() { diff --git a/packages/yoastseo/src/languageProcessing/researches/getProminentWordsForInternalLinking.js b/packages/yoastseo/src/languageProcessing/researches/getProminentWordsForInternalLinking.js index 66564bbe6fa..1894fc44ff7 100644 --- a/packages/yoastseo/src/languageProcessing/researches/getProminentWordsForInternalLinking.js +++ b/packages/yoastseo/src/languageProcessing/researches/getProminentWordsForInternalLinking.js @@ -13,6 +13,18 @@ import baseStemmer from "../helpers/morphology/baseStemmer"; import removeURLs from "../helpers/sanitize/removeURLs.js"; import removeEmailAddresses from "../helpers/sanitize/removeEmailAddresses"; +/** + * Removes URLs and email addresses from the text. + * + * @param {string} text The text to sanitize. + * + * @returns {string} The text without URLs and email addresses. + */ +const sanitizeText = function( text ) { + text = removeURLs( text ); + return removeEmailAddresses( text ); +}; + /** * Retrieves the prominent words from the given paper. * @@ -34,13 +46,9 @@ function getProminentWordsForInternalLinking( paper, researcher ) { // An optional custom helper to count length to use instead of countWords. const customCountLength = researcher.getHelper( "customCountLength" ); - let text = paper.getText(); - // We don't want to include URLs or email addresses in prominent words. 
- text = removeURLs( text ); - text = removeEmailAddresses( text ); - - const metadescription = removeURLs( paper.getDescription() ); - const title = removeURLs( paper.getTitle() ); + const text = sanitizeText( paper.getText() ); + const metadescription = sanitizeText( paper.getDescription() ); + const title = sanitizeText( paper.getTitle() ); const result = {}; result.hasMetaDescription = metadescription !== ""; From e1ee7760a4e26436d1fcf6ae3eadb4aa60df5e74 Mon Sep 17 00:00:00 2001 From: Agnieszka Szuba Date: Tue, 8 Nov 2022 10:48:27 +0100 Subject: [PATCH 08/14] Add unit tests to getProminentWordsForInternalLinkingSpec --- ...getProminentWordsForInternalLinkingSpec.js | 52 +++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/packages/yoastseo/spec/languageProcessing/researches/getProminentWordsForInternalLinkingSpec.js b/packages/yoastseo/spec/languageProcessing/researches/getProminentWordsForInternalLinkingSpec.js index 8bb8b2df8cf..0b72dc562bc 100644 --- a/packages/yoastseo/spec/languageProcessing/researches/getProminentWordsForInternalLinkingSpec.js +++ b/packages/yoastseo/spec/languageProcessing/researches/getProminentWordsForInternalLinkingSpec.js @@ -381,5 +381,57 @@ describe( "test for prominent words research for languages that have custom help expect( words ).toEqual( expected ); } ); + + it( "does not count URLs and email addresses as prominent words", function() { + const paper = new Paper( "http://blog.example.com/examples".repeat( 180 ) + " example@something.com".repeat( 180 ), + { title: "example@something.com example@something.com example@something.com" } ); + + const researcher = new Researcher( paper ); + + const expected = { + prominentWords: [], + hasMetaDescription: false, + hasTitle: true, + }; + + const words = prominentWordsResearch( paper, researcher ); + + expect( words ).toEqual( expected ); + } ); + + it( "does not return prominent words when the text is longer than 100 words including URLs and emails, but shorter" + + "than 
100 words when they are excluded", function() { + const paper = new Paper( "http://blog.example.com/examples".repeat( 180 ) + " example@something.com".repeat( 180 ) + + " cats".repeat( 50 ), { title: "example@something.com example@something.com example@something.com" } ); + + const researcher = new Researcher( paper ); + + const expected = { + prominentWords: [], + hasMetaDescription: false, + hasTitle: true, + }; + + const words = prominentWordsResearch( paper, researcher ); + + expect( words ).toEqual( expected ); + } ); + + it( "returns prominent words when the text is longer than 100 words after excluding URLs and emails", function() { + const paper = new Paper( "http://blog.example.com/examples".repeat( 180 ) + " example@something.com".repeat( 180 ) + + " cats".repeat( 101 ), { title: "example@something.com example@something.com example@something.com" } ); + + const researcher = new Researcher( paper ); + + const expected = { + prominentWords: [ new ProminentWord( "cats", "cats", 101 ) ], + hasMetaDescription: false, + hasTitle: true, + }; + + const words = prominentWordsResearch( paper, researcher ); + + expect( words ).toEqual( expected ); + } ); } ); From 7011f0b816e567a057ffa005ce9acd720c8a544c Mon Sep 17 00:00:00 2001 From: Agnieszka Szuba Date: Tue, 8 Nov 2022 12:35:57 +0100 Subject: [PATCH 09/14] Adds a semi-colon to the characters that can appear after the top-level domain This was added as a fix to WP editor converting & to & --- .../languageProcessing/helpers/sanitize/removeURLsSpec.js | 3 +++ .../src/languageProcessing/helpers/sanitize/removeURLs.js | 4 ++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/packages/yoastseo/spec/languageProcessing/helpers/sanitize/removeURLsSpec.js b/packages/yoastseo/spec/languageProcessing/helpers/sanitize/removeURLsSpec.js index 2ec9429357a..4e084f79ef0 100644 --- a/packages/yoastseo/spec/languageProcessing/helpers/sanitize/removeURLsSpec.js +++ 
b/packages/yoastseo/spec/languageProcessing/helpers/sanitize/removeURLsSpec.js @@ -28,6 +28,9 @@ describe( "a test for removing URLs from a string", function() { it( "removes a URL containing special characters.", function() { expect( removeURLs( "https://www.example.com/foo/?bar=baz&inga=42&quux" ) ).toBe( "" ); } ); + it( "removes a URL containing a semi-colon.", function() { + expect( removeURLs( "https://www.example.com/foo/?bar=baz&inga=42&quux" ) ).toBe( "" ); + } ); it( "removes a URL containing more special characters.", function() { expect( removeURLs( "http://foo.com/blah_(wikipedia)_blah#cite-1" ) ).toBe( "" ); } ); diff --git a/packages/yoastseo/src/languageProcessing/helpers/sanitize/removeURLs.js b/packages/yoastseo/src/languageProcessing/helpers/sanitize/removeURLs.js index 727fbc0237f..c83b006a345 100644 --- a/packages/yoastseo/src/languageProcessing/helpers/sanitize/removeURLs.js +++ b/packages/yoastseo/src/languageProcessing/helpers/sanitize/removeURLs.js @@ -1,5 +1,5 @@ -const urlRegex = new RegExp( "(ftp|http(s)?:\\/\\/.)(www\\\\.)?[-a-zA-Z0-9@:%._\\/+~#=]{2,256}\\.[a-z]{2,6}\\b([-a-zA-Z0-9@:%_\\/+.~#?&()=]*)" + - "|www\\.[-a-zA-Z0-9@:%._\\/+~#=]{2,256}\\.[a-z]{2,6}\\b([-a-zA-Z0-9@:%_\\/+.~#?&()=]*)", "igm" ); +const urlRegex = new RegExp( "(ftp|http(s)?:\\/\\/.)(www\\\\.)?[-a-zA-Z0-9@:%._\\/+~#=]{2,256}\\.[a-z]{2,6}\\b([-a-zA-Z0-9@:;%_\\/+.~#?&()=]*)" + + "|www\\.[-a-zA-Z0-9@:%._\\/+~#=]{2,256}\\.[a-z]{2,6}\\b([-a-zA-Z0-9@:;%_\\/+.~#?&()=]*)", "igm" ); /** * Removes URLs from a text. 
* From e60b3c7d92ccdedd9482f9303d9fd6a1b0642e95 Mon Sep 17 00:00:00 2001 From: Agnieszka Szuba Date: Tue, 8 Nov 2022 16:56:22 +0100 Subject: [PATCH 10/14] Add unit tests for prominent words for insights research --- .../getProminentWordsForInsightsSpec.js | 39 +++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/packages/yoastseo/spec/languageProcessing/researches/getProminentWordsForInsightsSpec.js b/packages/yoastseo/spec/languageProcessing/researches/getProminentWordsForInsightsSpec.js index 16e1b189079..646885a69a5 100644 --- a/packages/yoastseo/spec/languageProcessing/researches/getProminentWordsForInsightsSpec.js +++ b/packages/yoastseo/spec/languageProcessing/researches/getProminentWordsForInsightsSpec.js @@ -175,3 +175,42 @@ describe( "test for prominent words research for languages that have custom help expect( words ).toEqual( expected ); } ); } ); + +describe( "test for filtering out of URLs and email addresses", function() { + it( "does not include URLs in prominent words", function() { + const paper = new Paper( "http://blog.example.com/examples ".repeat( 180 ) + "cats ".repeat( 50 ) ); + + const researcher = new Researcher( paper ); + researcher.addResearchData( "morphology", morphologyData ); + + const words = getProminentWordsForInsights( paper, researcher ); + expect( words ).toEqual( [ + new ProminentWord( "cats", "cat", 50 ), + ] ); + } ); + + it( "does not include email addresses in prominent words", function() { + const paper = new Paper( "example89@something.com ".repeat( 180 ) + "cats ".repeat( 50 ) ); + + const researcher = new Researcher( paper ); + researcher.addResearchData( "morphology", morphologyData ); + + const words = getProminentWordsForInsights( paper, researcher ); + expect( words ).toEqual( [ + new ProminentWord( "cats", "cat", 50 ), + ] ); + } ); + + it( "includes domain names in prominent words", function() { + const paper = new Paper( "example.com ".repeat( 180 ) + "cats ".repeat( 50 ) ); + + const researcher = 
new Researcher( paper ); + researcher.addResearchData( "morphology", morphologyData ); + + const words = getProminentWordsForInsights( paper, researcher ); + expect( words ).toEqual( [ + new ProminentWord( "example.com", "example.com", 180 ), + new ProminentWord( "cats", "cat", 50 ), + ] ); + } ); +} ); From 1e2a7e6938978b0a48f82f1edc750a5dc3a0f9ef Mon Sep 17 00:00:00 2001 From: Agnieszka Szuba Date: Tue, 8 Nov 2022 16:56:43 +0100 Subject: [PATCH 11/14] Adjust prominent words for internal linking research unit tests --- ...getProminentWordsForInternalLinkingSpec.js | 22 ++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/packages/yoastseo/spec/languageProcessing/researches/getProminentWordsForInternalLinkingSpec.js b/packages/yoastseo/spec/languageProcessing/researches/getProminentWordsForInternalLinkingSpec.js index 0b72dc562bc..fe232aaa901 100644 --- a/packages/yoastseo/spec/languageProcessing/researches/getProminentWordsForInternalLinkingSpec.js +++ b/packages/yoastseo/spec/languageProcessing/researches/getProminentWordsForInternalLinkingSpec.js @@ -383,7 +383,7 @@ describe( "test for prominent words research for languages that have custom help } ); it( "does not count URLs and email addresses as prominent words", function() { - const paper = new Paper( "http://blog.example.com/examples".repeat( 180 ) + " example@something.com".repeat( 180 ), + const paper = new Paper( "http://blog.example.com/examples ".repeat( 180 ) + "example@something.com ".repeat( 180 ), { title: "example@something.com example@something.com example@something.com" } ); const researcher = new Researcher( paper ); @@ -399,9 +399,25 @@ describe( "test for prominent words research for languages that have custom help expect( words ).toEqual( expected ); } ); + it( "counts domain names as prominent words", function() { + const paper = new Paper( "yoast.com ".repeat( 180 ) ); + + const researcher = new Researcher( paper ); + + const expected = { + prominentWords: [ new 
ProminentWord( "yoast.com", "yoast.com", 180 ) ], + hasMetaDescription: false, + hasTitle: false, + }; + + const words = prominentWordsResearch( paper, researcher ); + + expect( words ).toEqual( expected ); + } ); + it( "does not return prominent words when the text is longer than 100 words including URLs and emails, but shorter" + "than 100 words when they are excluded", function() { - const paper = new Paper( "http://blog.example.com/examples".repeat( 180 ) + " example@something.com".repeat( 180 ) + + const paper = new Paper( "http://blog.example.com/examples ".repeat( 180 ) + "example@something.com ".repeat( 180 ) + " cats".repeat( 50 ), { title: "example@something.com example@something.com example@something.com" } ); const researcher = new Researcher( paper ); @@ -418,7 +434,7 @@ describe( "test for prominent words research for languages that have custom help } ); it( "returns prominent words when the text is longer than 100 words after excluding URLs and emails", function() { - const paper = new Paper( "http://blog.example.com/examples".repeat( 180 ) + " example@something.com".repeat( 180 ) + + const paper = new Paper( "http://blog.example.com/examples ".repeat( 180 ) + "example@something.com ".repeat( 180 ) + " cats".repeat( 101 ), { title: "example@something.com example@something.com example@something.com" } ); const researcher = new Researcher( paper ); From 43811c8f1a975736a668c7378cc044eb9f43baab Mon Sep 17 00:00:00 2001 From: Marina Koleva Date: Thu, 10 Nov 2022 13:52:04 +0100 Subject: [PATCH 12/14] adding a comment to removeURLs.js --- .../researches/getProminentWordsForInsightsSpec.js | 2 +- .../src/languageProcessing/helpers/sanitize/removeURLs.js | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/packages/yoastseo/spec/languageProcessing/researches/getProminentWordsForInsightsSpec.js b/packages/yoastseo/spec/languageProcessing/researches/getProminentWordsForInsightsSpec.js index 646885a69a5..19a9b89048a 100644 --- 
a/packages/yoastseo/spec/languageProcessing/researches/getProminentWordsForInsightsSpec.js +++ b/packages/yoastseo/spec/languageProcessing/researches/getProminentWordsForInsightsSpec.js @@ -176,7 +176,7 @@ describe( "test for prominent words research for languages that have custom help } ); } ); -describe( "test for filtering out of URLs and email addresses", function() { +describe( "test for filtering out URLs and email addresses", function() { it( "does not include URLs in prominent words", function() { const paper = new Paper( "http://blog.example.com/examples ".repeat( 180 ) + "cats ".repeat( 50 ) ); diff --git a/packages/yoastseo/src/languageProcessing/helpers/sanitize/removeURLs.js b/packages/yoastseo/src/languageProcessing/helpers/sanitize/removeURLs.js index c83b006a345..309e222ab74 100644 --- a/packages/yoastseo/src/languageProcessing/helpers/sanitize/removeURLs.js +++ b/packages/yoastseo/src/languageProcessing/helpers/sanitize/removeURLs.js @@ -1,7 +1,8 @@ const urlRegex = new RegExp( "(ftp|http(s)?:\\/\\/.)(www\\\\.)?[-a-zA-Z0-9@:%._\\/+~#=]{2,256}\\.[a-z]{2,6}\\b([-a-zA-Z0-9@:;%_\\/+.~#?&()=]*)" + "|www\\.[-a-zA-Z0-9@:%._\\/+~#=]{2,256}\\.[a-z]{2,6}\\b([-a-zA-Z0-9@:;%_\\/+.~#?&()=]*)", "igm" ); /** - * Removes URLs from a text. + * Removes URLs from a text, whether they are embedded in tags or not. + * Domain names are not removed (e.g. "yoast.com" in "We got so much traffic on yoast.com after the latest release"). * * @param {string} text The text to remove URLs from. 
* From cc3179c6ef0f345a8dd31ec61ad5d8f71e5516e2 Mon Sep 17 00:00:00 2001 From: Agnieszka Szuba Date: Tue, 29 Nov 2022 16:38:25 +0100 Subject: [PATCH 13/14] Remove spaces from Japanese text before counting characters --- .../languages/ja/helpers/countCharacters.js | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/packages/yoastseo/src/languageProcessing/languages/ja/helpers/countCharacters.js b/packages/yoastseo/src/languageProcessing/languages/ja/helpers/countCharacters.js index 782d58371dd..5c2cec887af 100644 --- a/packages/yoastseo/src/languageProcessing/languages/ja/helpers/countCharacters.js +++ b/packages/yoastseo/src/languageProcessing/languages/ja/helpers/countCharacters.js @@ -3,7 +3,9 @@ const { sanitizeString } = languageProcessing; import removeURLs from "../../../helpers/sanitize/removeURLs.js"; /** - * Calculates the character count of a text, including punctuation and numbers. Is used to determine length of text. + * Calculates the character count which serves as a measure of text length. + * The character count includes letters, punctuation, and numbers. It doesn't include URLs, HTML tags, spaces, and the + * content of the Table of Contents and Estimated Reading Time blocks. * * @param {string} text The text to be counted. 
* @@ -12,6 +14,7 @@ import removeURLs from "../../../helpers/sanitize/removeURLs.js"; export default function( text ) { text = removeURLs( text ); text = sanitizeString( text ); + text = text.replace( /\s/g, "" ); return text.length; } From f940c11051b6a9892fb52f6666aa113a034df603 Mon Sep 17 00:00:00 2001 From: Agnieszka Szuba Date: Tue, 29 Nov 2022 17:24:55 +0100 Subject: [PATCH 14/14] Fix unit tests --- .../spec/fullTextTests/testTexts/ja/japanesePaper.js | 2 +- .../helpers/sentence/sentencesLengthSpec.js | 2 +- .../languages/ja/helpers/countCharactersSpec.js | 6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/packages/yoastseo/spec/fullTextTests/testTexts/ja/japanesePaper.js b/packages/yoastseo/spec/fullTextTests/testTexts/ja/japanesePaper.js index 915451bc5df..39d9f79129a 100644 --- a/packages/yoastseo/spec/fullTextTests/testTexts/ja/japanesePaper.js +++ b/packages/yoastseo/spec/fullTextTests/testTexts/ja/japanesePaper.js @@ -66,7 +66,7 @@ const expectedResults = { textLength: { isApplicable: true, score: 9, - resultText: "Text length: The text contains 3165 characters. Good job!", + resultText: "Text length: The text contains 3022 characters. 
Good job!", }, externalLinks: { isApplicable: true, diff --git a/packages/yoastseo/spec/languageProcessing/helpers/sentence/sentencesLengthSpec.js b/packages/yoastseo/spec/languageProcessing/helpers/sentence/sentencesLengthSpec.js index 5a83c81d57c..0b50b8e7b7b 100644 --- a/packages/yoastseo/spec/languageProcessing/helpers/sentence/sentencesLengthSpec.js +++ b/packages/yoastseo/spec/languageProcessing/helpers/sentence/sentencesLengthSpec.js @@ -35,7 +35,7 @@ describe( "A test to count sentence lengths.", function() { expect( lengths ).toEqual( [ { sentence: "自然おのずから存在しているもの", sentenceLength: 15 }, - { sentence: "歩くさわやかな森 自然 ", sentenceLength: 11 }, + { sentence: "歩くさわやかな森 自然 ", sentenceLength: 10 }, ] ); } ); } ); diff --git a/packages/yoastseo/spec/languageProcessing/languages/ja/helpers/countCharactersSpec.js b/packages/yoastseo/spec/languageProcessing/languages/ja/helpers/countCharactersSpec.js index f154c85be5c..a601da67476 100644 --- a/packages/yoastseo/spec/languageProcessing/languages/ja/helpers/countCharactersSpec.js +++ b/packages/yoastseo/spec/languageProcessing/languages/ja/helpers/countCharactersSpec.js @@ -12,9 +12,9 @@ describe( "counts characters in a string", function() { "東京オリンピック開会直前の1964年(昭和39年)10月1日に開業した。" ) ).toBe( 136 ); } ); it( "makes sure the countCharacters function still works when the input is a non-Japanese string", function() { - expect( countCharactersFunction( "this is a string" ) ).toBe( 16 ); + expect( countCharactersFunction( "this is a string" ) ).toBe( 13 ); expect( countCharactersFunction( "Низът в компютърните науки е крайна поредица от символи " + - "(представляващи краен брой знаци)." ) ).toBe( 90 ); + "(представляващи краен брой знаци)." ) ).toBe( 78 ); } ); it( "makes sure that the table of contents is excluded from the calculation", function() { const text = "

目次

  • 戦後においては一般に広義の童謡にカテゴライズされる本作品は、" + "初出本の題名にもあるとおり青木自身は「唱歌」であるとし、「学校や家庭で」歌ってもらえれば本懐であるとしている。しかし発表当時の教育現場では、" + "本作品を歌うことは原則上はできなかった。

    "; - expect( countCharactersFunction( text ) ).toBe( 757 ); + expect( countCharactersFunction( text ) ).toBe( 744 ); } ); it( "makes sure that no characters are counted when a URL is embedded in video tags", function() { const text = "