diff --git a/packages/yoastseo/spec/fullTextTests/testTexts/ja/japanesePaper.js b/packages/yoastseo/spec/fullTextTests/testTexts/ja/japanesePaper.js
index 915451bc5df..39d9f79129a 100644
--- a/packages/yoastseo/spec/fullTextTests/testTexts/ja/japanesePaper.js
+++ b/packages/yoastseo/spec/fullTextTests/testTexts/ja/japanesePaper.js
@@ -66,7 +66,7 @@ const expectedResults = {
textLength: {
isApplicable: true,
score: 9,
- resultText: "Text length: The text contains 3165 characters. Good job!",
+ resultText: "Text length: The text contains 3022 characters. Good job!",
},
externalLinks: {
isApplicable: true,
diff --git a/packages/yoastseo/spec/languageProcessing/helpers/sanitize/removeEmailAddressesSpec.js b/packages/yoastseo/spec/languageProcessing/helpers/sanitize/removeEmailAddressesSpec.js
new file mode 100644
index 00000000000..31178173848
--- /dev/null
+++ b/packages/yoastseo/spec/languageProcessing/helpers/sanitize/removeEmailAddressesSpec.js
@@ -0,0 +1,19 @@
+import removeEmailAddresses from "../../../../src/languageProcessing/helpers/sanitize/removeEmailAddresses.js";
+
+describe( "a test for removing email addresses from a string", function() {
+ it( "removes an email address", function() {
+ expect( removeEmailAddresses( "example@something.com" ) ).toBe( "" );
+ } );
+ it( "removes an email address with special characters", function() {
+ expect( removeEmailAddresses( "some+long+email+address23@some+host-weird-/looking.com" ) ).toBe( "" );
+ } );
+ it( "removes a very short email address", function() {
+ expect( removeEmailAddresses( "a@b.com" ) ).toBe( "" );
+ } );
+ it( "does not remove invalid email addresses", function() {
+ expect( removeEmailAddresses( "@b.com" ) ).toBe( "@b.com" );
+ expect( removeEmailAddresses( "a@b" ) ).toBe( "a@b" );
+ expect( removeEmailAddresses( "example@" ) ).toBe( "example@" );
+ expect( removeEmailAddresses( "example.com" ) ).toBe( "example.com" );
+ } );
+} );
diff --git a/packages/yoastseo/spec/languageProcessing/helpers/sanitize/removeURLsSpec.js b/packages/yoastseo/spec/languageProcessing/helpers/sanitize/removeURLsSpec.js
new file mode 100644
index 00000000000..4e084f79ef0
--- /dev/null
+++ b/packages/yoastseo/spec/languageProcessing/helpers/sanitize/removeURLsSpec.js
@@ -0,0 +1,52 @@
+import removeURLs from "../../../../src/languageProcessing/helpers/sanitize/removeURLs.js";
+
+describe( "a test for removing URLs from a string", function() {
+ it( "removes a base URL", function() {
+ expect( removeURLs( "https://example.com" ) ).toBe( "" );
+ } );
+ it( "removes a URL followed by a subdirectory", function() {
+ expect( removeURLs( "https://example.com/example1" ) ).toBe( "" );
+ } );
+ it( "removes a URL followed by multiple subdirectories", function() {
+ expect( removeURLs( "https://example.com/example1/part1" ) ).toBe( "" );
+ } );
+ it( "removes a URL with a subdomain", function() {
+ expect( removeURLs( "https://blog.example.com/examples" ) ).toBe( "" );
+ } );
+ it( "removes a URL starting with http://", function() {
+ expect( removeURLs( "http://blog.example.com/examples" ) ).toBe( "" );
+ } );
+ it( "removes a URL containing www.", function() {
+ expect( removeURLs( "http://www.blog.example.com/examples" ) ).toBe( "" );
+ } );
+ it( "removes a URL starting with www.", function() {
+ expect( removeURLs( "www.blog.example.com/examples" ) ).toBe( "" );
+ } );
+ it( "removes a URL starting with ftp", function() {
+ expect( removeURLs( "ftp://example.com" ) ).toBe( "" );
+ } );
+ it( "removes a URL containing special characters.", function() {
+ expect( removeURLs( "https://www.example.com/foo/?bar=baz&inga=42&quux" ) ).toBe( "" );
+ } );
+ it( "removes a URL containing a semi-colon.", function() {
+ expect( removeURLs( "https://www.example.com/foo/?bar=baz;inga=42;quux" ) ).toBe( "" );
+ } );
+ it( "removes a URL containing more special characters.", function() {
+ expect( removeURLs( "http://foo.com/blah_(wikipedia)_blah#cite-1" ) ).toBe( "" );
+ } );
+ it( "removes a URL with a different top-level domain", function() {
+ expect( removeURLs( "http://example.co.uk" ) ).toBe( "" );
+ } );
+ it( "removes a URL followed by Japanese characters", function() {
+ expect( removeURLs( "https://example.comこれに対し日本国有鉄道" ) ).toBe( "これに対し日本国有鉄道" );
+ } );
+ it( "does not remove a URL that doesn't start with 'http(s)://', 'ftp://' or 'www'.", function() {
+ expect( removeURLs( "example.com" ) ).toBe( "example.com" );
+ } );
+ it( "does not remove https:// on its own", function() {
+ expect( removeURLs( "https://" ) ).toBe( "https://" );
+ } );
+ it( "does not remove a URL without a top-level domain", function() {
+ expect( removeURLs( "https://example" ) ).toBe( "https://example" );
+ } );
+} );
diff --git a/packages/yoastseo/spec/languageProcessing/helpers/sentence/sentencesLengthSpec.js b/packages/yoastseo/spec/languageProcessing/helpers/sentence/sentencesLengthSpec.js
index 5a83c81d57c..0b50b8e7b7b 100644
--- a/packages/yoastseo/spec/languageProcessing/helpers/sentence/sentencesLengthSpec.js
+++ b/packages/yoastseo/spec/languageProcessing/helpers/sentence/sentencesLengthSpec.js
@@ -35,7 +35,7 @@ describe( "A test to count sentence lengths.", function() {
expect( lengths ).toEqual( [
{ sentence: "自然おのずから存在しているもの", sentenceLength: 15 },
- { sentence: "歩くさわやかな森 自然 ", sentenceLength: 11 },
+ { sentence: "歩くさわやかな森 自然 ", sentenceLength: 10 },
] );
} );
} );
diff --git a/packages/yoastseo/spec/languageProcessing/languages/ja/helpers/countCharactersSpec.js b/packages/yoastseo/spec/languageProcessing/languages/ja/helpers/countCharactersSpec.js
index 69cc0a18e6e..a601da67476 100644
--- a/packages/yoastseo/spec/languageProcessing/languages/ja/helpers/countCharactersSpec.js
+++ b/packages/yoastseo/spec/languageProcessing/languages/ja/helpers/countCharactersSpec.js
@@ -1,4 +1,3 @@
-import { doesWordMatchRegex } from "../../../../../src/languageProcessing/helpers/morphology/regexHelpers";
import countCharactersFunction from "../../../../../src/languageProcessing/languages/ja/helpers/countCharacters.js";
describe( "counts characters in a string", function() {
@@ -13,9 +12,9 @@ describe( "counts characters in a string", function() {
"東京オリンピック開会直前の1964年(昭和39年)10月1日に開業した。" ) ).toBe( 136 );
} );
it( "makes sure the countCharacters function still works when the input is a non-Japanese string", function() {
- expect( countCharactersFunction( "this is a string" ) ).toBe( 16 );
+ expect( countCharactersFunction( "this is a string" ) ).toBe( 13 );
expect( countCharactersFunction( "Низът в компютърните науки е крайна поредица от символи " +
- "(представляващи краен брой знаци)." ) ).toBe( 90 );
+ "(представляващи краен брой знаци)." ) ).toBe( 78 );
} );
it( "makes sure that the table of contents is excluded from the calculation", function() {
const text = "
目次
戦後においては一般に広義の童謡にカテゴライズされる本作品は、" +
"初出本の題名にもあるとおり青木自身は「唱歌」であるとし、「学校や家庭で」歌ってもらえれば本懐であるとしている。しかし発表当時の教育現場では、" +
"本作品を歌うことは原則上はできなかった。";
- expect( countCharactersFunction( text ) ).toBe( 757 );
+ expect( countCharactersFunction( text ) ).toBe( 744 );
} );
- it( "makes sure that no charachters are counted when a URL is embedded in video tags", function() {
+ it( "makes sure that no characters are counted when a URL is embedded in video tags", function() {
const text = "\n" +
"\t\t\t {
- it( "Returns true if there is a match against a URL starting with www", () => {
- // eslint-disable-next-line max-len
- expect( doesWordMatchRegex( "www.yoast.com", "(http(s)?:\\/\\/.)?(www\\.|ftp:\\/\\/)?[-a-zA-Z0-9@:%._\\/+~#=]{2,256}\\.[a-z]{2,6}\\b([-a-zA-Z0-9@:%_\\/+.~#?&()=]*)" ) ).toEqual( true );
- } );
- it( "Returns true if there is a match against a URL starting with https", () => {
- // eslint-disable-next-line max-len
- expect( doesWordMatchRegex( "https://www.codecademy.com/learn/hello", "(http(s)?:\\/\\/.)?(www\\.|ftp:\\/\\/)?[-a-zA-Z0-9@:%._\\/+~#=]{2,256}\\.[a-z]{2,6}\\b([-a-zA-Z0-9@:%_\\/+.~#?&()=]*)" ) ).toEqual( true );
- } );
- it( "Returns true if there is a match against a URL starting with http", () => {
- // eslint-disable-next-line max-len
- expect( doesWordMatchRegex( "http://foo.com/blah_blah/", "(http(s)?:\\/\\/.)?(www\\.|ftp:\\/\\/)?[-a-zA-Z0-9@:%._\\/+~#=]{2,256}\\.[a-z]{2,6}\\b([-a-zA-Z0-9@:%_\\/+.~#?&()=]*)" ) ).toEqual( true );
- } );
- it( "Returns false if there is no match", () => {
- // eslint-disable-next-line max-len
- expect( doesWordMatchRegex( "My cat is sweet!", "(http(s)?:\\/\\/.)?(www\\.|ftp:\\/\\/)?[-a-zA-Z0-9@:%._\\/+~#=]{2,256}\\.[a-z]{2,6}\\b([-a-zA-Z0-9@:%_\\/+.~#?&()=]*)" ) ).toEqual( false );
- } );
-} );
diff --git a/packages/yoastseo/spec/languageProcessing/researches/getProminentWordsForInsightsSpec.js b/packages/yoastseo/spec/languageProcessing/researches/getProminentWordsForInsightsSpec.js
index 16e1b189079..19a9b89048a 100644
--- a/packages/yoastseo/spec/languageProcessing/researches/getProminentWordsForInsightsSpec.js
+++ b/packages/yoastseo/spec/languageProcessing/researches/getProminentWordsForInsightsSpec.js
@@ -175,3 +175,42 @@ describe( "test for prominent words research for languages that have custom help
expect( words ).toEqual( expected );
} );
} );
+
+describe( "test for filtering out URLs and email addresses", function() {
+ it( "does not include URLs in prominent words", function() {
+ const paper = new Paper( "http://blog.example.com/examples ".repeat( 180 ) + "cats ".repeat( 50 ) );
+
+ const researcher = new Researcher( paper );
+ researcher.addResearchData( "morphology", morphologyData );
+
+ const words = getProminentWordsForInsights( paper, researcher );
+ expect( words ).toEqual( [
+ new ProminentWord( "cats", "cat", 50 ),
+ ] );
+ } );
+
+ it( "does not include email addresses in prominent words", function() {
+ const paper = new Paper( "example89@something.com ".repeat( 180 ) + "cats ".repeat( 50 ) );
+
+ const researcher = new Researcher( paper );
+ researcher.addResearchData( "morphology", morphologyData );
+
+ const words = getProminentWordsForInsights( paper, researcher );
+ expect( words ).toEqual( [
+ new ProminentWord( "cats", "cat", 50 ),
+ ] );
+ } );
+
+ it( "includes domain names in prominent words", function() {
+ const paper = new Paper( "example.com ".repeat( 180 ) + "cats ".repeat( 50 ) );
+
+ const researcher = new Researcher( paper );
+ researcher.addResearchData( "morphology", morphologyData );
+
+ const words = getProminentWordsForInsights( paper, researcher );
+ expect( words ).toEqual( [
+ new ProminentWord( "example.com", "example.com", 180 ),
+ new ProminentWord( "cats", "cat", 50 ),
+ ] );
+ } );
+} );
diff --git a/packages/yoastseo/spec/languageProcessing/researches/getProminentWordsForInternalLinkingSpec.js b/packages/yoastseo/spec/languageProcessing/researches/getProminentWordsForInternalLinkingSpec.js
index 8bb8b2df8cf..fe232aaa901 100644
--- a/packages/yoastseo/spec/languageProcessing/researches/getProminentWordsForInternalLinkingSpec.js
+++ b/packages/yoastseo/spec/languageProcessing/researches/getProminentWordsForInternalLinkingSpec.js
@@ -381,5 +381,73 @@ describe( "test for prominent words research for languages that have custom help
expect( words ).toEqual( expected );
} );
+
+ it( "does not count URLs and email addresses as prominent words", function() {
+ const paper = new Paper( "http://blog.example.com/examples ".repeat( 180 ) + "example@something.com ".repeat( 180 ),
+ { title: "example@something.com example@something.com example@something.com" } );
+
+ const researcher = new Researcher( paper );
+
+ const expected = {
+ prominentWords: [],
+ hasMetaDescription: false,
+ hasTitle: true,
+ };
+
+ const words = prominentWordsResearch( paper, researcher );
+
+ expect( words ).toEqual( expected );
+ } );
+
+ it( "counts domain names as prominent words", function() {
+ const paper = new Paper( "yoast.com ".repeat( 180 ) );
+
+ const researcher = new Researcher( paper );
+
+ const expected = {
+ prominentWords: [ new ProminentWord( "yoast.com", "yoast.com", 180 ) ],
+ hasMetaDescription: false,
+ hasTitle: false,
+ };
+
+ const words = prominentWordsResearch( paper, researcher );
+
+ expect( words ).toEqual( expected );
+ } );
+
+ it( "does not return prominent words when the text is longer than 100 words including URLs and emails, but shorter " +
+ "than 100 words when they are excluded", function() {
+ const paper = new Paper( "http://blog.example.com/examples ".repeat( 180 ) + "example@something.com ".repeat( 180 ) +
+ " cats".repeat( 50 ), { title: "example@something.com example@something.com example@something.com" } );
+
+ const researcher = new Researcher( paper );
+
+ const expected = {
+ prominentWords: [],
+ hasMetaDescription: false,
+ hasTitle: true,
+ };
+
+ const words = prominentWordsResearch( paper, researcher );
+
+ expect( words ).toEqual( expected );
+ } );
+
+ it( "returns prominent words when the text is longer than 100 words after excluding URLs and emails", function() {
+ const paper = new Paper( "http://blog.example.com/examples ".repeat( 180 ) + "example@something.com ".repeat( 180 ) +
+ " cats".repeat( 101 ), { title: "example@something.com example@something.com example@something.com" } );
+
+ const researcher = new Researcher( paper );
+
+ const expected = {
+ prominentWords: [ new ProminentWord( "cats", "cats", 101 ) ],
+ hasMetaDescription: false,
+ hasTitle: true,
+ };
+
+ const words = prominentWordsResearch( paper, researcher );
+
+ expect( words ).toEqual( expected );
+ } );
} );
diff --git a/packages/yoastseo/src/languageProcessing/helpers/sanitize/removeEmailAddresses.js b/packages/yoastseo/src/languageProcessing/helpers/sanitize/removeEmailAddresses.js
new file mode 100644
index 00000000000..48e2c76029d
--- /dev/null
+++ b/packages/yoastseo/src/languageProcessing/helpers/sanitize/removeEmailAddresses.js
@@ -0,0 +1,12 @@
+const emailRegex = new RegExp( "[^\\s@]+@[^\\s@]+\\.[^\\s@]+", "igm" );
+
+/**
+ * Removes email addresses from a text.
+ *
+ * @param {string} text The text to remove emails from.
+ *
+ * @returns {string} The text without email addresses.
+ */
+export default function( text ) {
+ return text.replace( emailRegex, "" );
+}
diff --git a/packages/yoastseo/src/languageProcessing/helpers/sanitize/removeURLs.js b/packages/yoastseo/src/languageProcessing/helpers/sanitize/removeURLs.js
new file mode 100644
index 00000000000..309e222ab74
--- /dev/null
+++ b/packages/yoastseo/src/languageProcessing/helpers/sanitize/removeURLs.js
@@ -0,0 +1,13 @@
+const urlRegex = new RegExp( "(ftp|http(s)?:\\/\\/.)(www\\.)?[-a-zA-Z0-9@:%._\\/+~#=]{2,256}\\.[a-z]{2,6}\\b([-a-zA-Z0-9@:;%_\\/+.~#?&()=]*)" +
+ "|www\\.[-a-zA-Z0-9@:%._\\/+~#=]{2,256}\\.[a-z]{2,6}\\b([-a-zA-Z0-9@:;%_\\/+.~#?&()=]*)", "igm" );
+/**
+ * Removes URLs from a text. The URL regex matches URLs whether they are embedded in tags or not.
+ * It doesn't match plain domain names (e.g. "yoast.com" in "We got so much traffic on yoast.com after the latest release").
+ *
+ * @param {string} text The text to remove URLs from.
+ *
+ * @returns {string} The text without URLs.
+ */
+export default function( text ) {
+ return text.replace( urlRegex, "" );
+}
diff --git a/packages/yoastseo/src/languageProcessing/languages/ja/helpers/countCharacters.js b/packages/yoastseo/src/languageProcessing/languages/ja/helpers/countCharacters.js
index b18ce6d84e6..5c2cec887af 100644
--- a/packages/yoastseo/src/languageProcessing/languages/ja/helpers/countCharacters.js
+++ b/packages/yoastseo/src/languageProcessing/languages/ja/helpers/countCharacters.js
@@ -1,19 +1,20 @@
import { languageProcessing } from "yoastseo";
const { sanitizeString } = languageProcessing;
+import removeURLs from "../../../helpers/sanitize/removeURLs.js";
/**
- * Calculates the character count of a text, including punctuation and numbers. Is used to determine length of text.
+ * Calculates the character count which serves as a measure of text length.
+ * The character count includes letters, punctuation, and numbers. It doesn't include URLs, HTML tags, spaces, and the
+ * content of the Table of Contents and Estimated Reading Time blocks.
*
* @param {string} text The text to be counted.
*
* @returns {number} The character count of the given text.
*/
export default function( text ) {
- // This regex is used to match URLs in the text, either embedded in tags or not, so that they are excluded from the characters count.
- // eslint-disable-next-line max-len
- const urlRegex = new RegExp( "(http(s)?:\\/\\/.)?(www\\.|ftp:\\/\\/)?[-a-zA-Z0-9@:%._\\/+~#=]{2,256}\\.[a-z]{2,6}\\b([-a-zA-Z0-9@:%_\\/+.~#?&()=]*)", "igm" );
- text = text.replace( urlRegex, "" );
+ text = removeURLs( text );
text = sanitizeString( text );
+ text = text.replace( /\s/g, "" );
return text.length;
}
diff --git a/packages/yoastseo/src/languageProcessing/researches/getProminentWordsForInsights.js b/packages/yoastseo/src/languageProcessing/researches/getProminentWordsForInsights.js
index 7e994d648e9..0079fb5442b 100644
--- a/packages/yoastseo/src/languageProcessing/researches/getProminentWordsForInsights.js
+++ b/packages/yoastseo/src/languageProcessing/researches/getProminentWordsForInsights.js
@@ -6,6 +6,8 @@ import {
retrieveAbbreviations,
sortProminentWords,
} from "../helpers/prominentWords/determineProminentWords";
+import removeURLs from "../helpers/sanitize/removeURLs.js";
+import removeEmailAddresses from "../helpers/sanitize/removeEmailAddresses";
/**
* Retrieves the prominent words from the given paper.
@@ -23,7 +25,10 @@ function getProminentWordsForInsights( paper, researcher ) {
// An optional custom helper to get words from the text.
const getWordsCustomHelper = researcher.getHelper( "getWordsCustomHelper" );
- const text = paper.getText();
+ let text = paper.getText();
+ // We don't want to include URLs or email addresses in prominent words.
+ text = removeURLs( text );
+ text = removeEmailAddresses( text );
// If the language has a custom helper to get words from the text, we don't retrieve the abbreviation.
const abbreviations = getWordsCustomHelper ? [] : retrieveAbbreviations( text );
diff --git a/packages/yoastseo/src/languageProcessing/researches/getProminentWordsForInternalLinking.js b/packages/yoastseo/src/languageProcessing/researches/getProminentWordsForInternalLinking.js
index 46faefc1b4e..1894fc44ff7 100644
--- a/packages/yoastseo/src/languageProcessing/researches/getProminentWordsForInternalLinking.js
+++ b/packages/yoastseo/src/languageProcessing/researches/getProminentWordsForInternalLinking.js
@@ -10,6 +10,20 @@ import {
} from "../helpers/prominentWords/determineProminentWords";
import { getSubheadingsTopLevel, removeSubheadingsTopLevel } from "../helpers/html/getSubheadings";
import baseStemmer from "../helpers/morphology/baseStemmer";
+import removeURLs from "../helpers/sanitize/removeURLs.js";
+import removeEmailAddresses from "../helpers/sanitize/removeEmailAddresses";
+
+/**
+ * Removes URLs and email addresses from the text.
+ *
+ * @param {string} text The text to sanitize.
+ *
+ * @returns {string} The text without URLs and email addresses.
+ */
+const sanitizeText = function( text ) {
+ text = removeURLs( text );
+ return removeEmailAddresses( text );
+};
/**
* Retrieves the prominent words from the given paper.
@@ -32,9 +46,9 @@ function getProminentWordsForInternalLinking( paper, researcher ) {
// An optional custom helper to count length to use instead of countWords.
const customCountLength = researcher.getHelper( "customCountLength" );
- const text = paper.getText();
- const metadescription = paper.getDescription();
- const title = paper.getTitle();
+ const text = sanitizeText( paper.getText() );
+ const metadescription = sanitizeText( paper.getDescription() );
+ const title = sanitizeText( paper.getTitle() );
const result = {};
result.hasMetaDescription = metadescription !== "";