Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
31262ae
Exclude links from prominent words
agnieszkaszuba Nov 3, 2022
a331571
Add unit tests
agnieszkaszuba Nov 7, 2022
8c9d663
Add helper for removing email addresses
agnieszkaszuba Nov 7, 2022
89957a0
Remove email addresses from the text before retrieving the prominent …
agnieszkaszuba Nov 7, 2022
d4ff1e7
Change the regex so that it doesn't match URLs that don't start with …
agnieszkaszuba Nov 7, 2022
63df888
Add more unit tests
agnieszkaszuba Nov 7, 2022
32c1ec6
Fix code style
agnieszkaszuba Nov 7, 2022
e1ee776
Add unit tests to getProminentWordsForInternalLinkingSpec
agnieszkaszuba Nov 8, 2022
7011f0b
Adds a semi-colon to the characters that can appear after the top-lev…
agnieszkaszuba Nov 8, 2022
e60b3c7
Add unit tests for prominent words for insights research
agnieszkaszuba Nov 8, 2022
1e2a7e6
Adjust prominent words for internal linking research unit tests
agnieszkaszuba Nov 8, 2022
c3b2c9b
Merge branch 'trunk' of https://github.com/Yoast/wordpress-seo into P…
marinakoleva Nov 10, 2022
a482a1a
Merge branch 'PC-865-exclude-links-from-prominent-words' of github.co…
agnieszkaszuba Nov 10, 2022
43811c8
adding a comment to removeURLs.js
marinakoleva Nov 10, 2022
2a3efa5
Merge branch 'trunk' of https://github.com/Yoast/wordpress-seo into P…
marinakoleva Nov 14, 2022
d115093
Merge branch 'trunk' of github.com:Yoast/wordpress-seo into PC-865-ex…
iolse Nov 17, 2022
2014f76
Merge branch 'trunk' of github.com:Yoast/wordpress-seo into PC-865-ex…
agnieszkaszuba Nov 28, 2022
cc3179c
Remove spaces from Japanese text before counting characters
agnieszkaszuba Nov 29, 2022
f940c11
Fix unit tests
agnieszkaszuba Nov 29, 2022
8ccdf72
Merge branch 'trunk' of github.com:Yoast/wordpress-seo into PC-865-ex…
FAMarfuaty Dec 5, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ const expectedResults = {
textLength: {
isApplicable: true,
score: 9,
resultText: "<a href='https://yoa.st/34n' target='_blank'>Text length</a>: The text contains 3165 characters. Good job!",
resultText: "<a href='https://yoa.st/34n' target='_blank'>Text length</a>: The text contains 3022 characters. Good job!",
},
externalLinks: {
isApplicable: true,
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
import removeEmailAddresses from "../../../../src/languageProcessing/helpers/sanitize/removeEmailAddresses.js";

describe( "a test for removing email addresses from a string", () => {
	it( "removes an email address", () => {
		expect( removeEmailAddresses( "example@something.com" ) ).toBe( "" );
	} );
	it( "removes an email address with special characters", () => {
		expect( removeEmailAddresses( "some+long+email+address23@some+host-weird-/looking.com" ) ).toBe( "" );
	} );
	it( "removes a very short email address", () => {
		expect( removeEmailAddresses( "a@b.com" ) ).toBe( "" );
	} );
	it( "does not remove invalid email addresses", () => {
		// Strings missing a local part, a domain dot, or a domain should be left untouched.
		expect( removeEmailAddresses( "@b.com" ) ).toBe( "@b.com" );
		expect( removeEmailAddresses( "a@b" ) ).toBe( "a@b" );
		expect( removeEmailAddresses( "example@" ) ).toBe( "example@" );
		expect( removeEmailAddresses( "example.com" ) ).toBe( "example.com" );
	} );
} );
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
import removeURLs from "../../../../src/languageProcessing/helpers/sanitize/removeURLs.js";

describe( "a test for removing URLs from a string", () => {
	it( "removes a base URL", () => {
		expect( removeURLs( "https://example.com" ) ).toBe( "" );
	} );
	it( "removes a URL followed by a subdirectory", () => {
		expect( removeURLs( "https://example.com/example1" ) ).toBe( "" );
	} );
	it( "removes a URL followed by multiple subdirectories", () => {
		expect( removeURLs( "https://example.com/example1/part1" ) ).toBe( "" );
	} );
	it( "removes a URL with a subdomain", () => {
		expect( removeURLs( "https://blog.example.com/examples" ) ).toBe( "" );
	} );
	it( "removes a URL starting with http://", () => {
		expect( removeURLs( "http://blog.example.com/examples" ) ).toBe( "" );
	} );
	it( "removes a URL containing www.", () => {
		expect( removeURLs( "http://www.blog.example.com/examples" ) ).toBe( "" );
	} );
	it( "removes a URL starting with www.", () => {
		expect( removeURLs( "www.blog.example.com/examples" ) ).toBe( "" );
	} );
	it( "removes a URL starting with ftp", () => {
		expect( removeURLs( "ftp://example.com" ) ).toBe( "" );
	} );
	it( "removes a URL containing special characters.", () => {
		expect( removeURLs( "https://www.example.com/foo/?bar=baz&inga=42&quux" ) ).toBe( "" );
	} );
	it( "removes a URL containing a semi-colon.", () => {
		expect( removeURLs( "https://www.example.com/foo/?bar=baz&amp;inga=42&amp;quux" ) ).toBe( "" );
	} );
	it( "removes a URL containing more special characters.", () => {
		expect( removeURLs( "http://foo.com/blah_(wikipedia)_blah#cite-1" ) ).toBe( "" );
	} );
	it( "removes a URL with a different top-level domain", () => {
		expect( removeURLs( "http://example.co.uk" ) ).toBe( "" );
	} );
	it( "removes a URL followed by Japanese characters", () => {
		// Only the URL itself should be stripped; the adjacent Japanese text must survive.
		expect( removeURLs( "https://example.comこれに対し日本国有鉄道" ) ).toBe( "これに対し日本国有鉄道" );
	} );
	it( "does not remove a URL that doesn't start with 'http(s)://', 'ftp://' or 'www'.", () => {
		expect( removeURLs( "example.com" ) ).toBe( "example.com" );
	} );
	it( "does not remove https:// on its own", () => {
		expect( removeURLs( "https://" ) ).toBe( "https://" );
	} );
	it( "does not remove a URL without a top-level domain", () => {
		expect( removeURLs( "https://example" ) ).toBe( "https://example" );
	} );
} );
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ describe( "A test to count sentence lengths.", function() {

expect( lengths ).toEqual( [
{ sentence: "自然おのずから存在しているもの", sentenceLength: 15 },
{ sentence: "歩くさわやかな森 <span style='color: red;'> 自然 </span>", sentenceLength: 11 },
{ sentence: "歩くさわやかな森 <span style='color: red;'> 自然 </span>", sentenceLength: 10 },
] );
} );
} );
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import { doesWordMatchRegex } from "../../../../../src/languageProcessing/helpers/morphology/regexHelpers";
import countCharactersFunction from "../../../../../src/languageProcessing/languages/ja/helpers/countCharacters.js";

describe( "counts characters in a string", function() {
Expand All @@ -13,9 +12,9 @@ describe( "counts characters in a string", function() {
"東京オリンピック開会直前の1964年(昭和39年)10月1日に開業した。" ) ).toBe( 136 );
} );
it( "makes sure the countCharacters function still works when the input is a non-Japanese string", function() {
expect( countCharactersFunction( "this is a string" ) ).toBe( 16 );
expect( countCharactersFunction( "this is a string" ) ).toBe( 13 );
expect( countCharactersFunction( "Низът в компютърните науки е крайна поредица от символи " +
"(представляващи краен брой знаци)." ) ).toBe( 90 );
"(представляващи краен брой знаци)." ) ).toBe( 78 );
} );
it( "makes sure that the table of contents is excluded from the calculation", function() {
const text = "<div class=\"wp-block-yoast-seo-table-of-contents yoast-table-of-contents\"><h2>目次</h2><ul><li><a " +
Expand All @@ -36,9 +35,9 @@ describe( "counts characters in a string", function() {
"<p>戦後においては一般に広義の<a href=\"https://ja.wikipedia.org/wiki/%E7%AB%A5%E8%AC%A1\">童謡</a>にカテゴライズされる本作品は、" +
"初出本の題名にもあるとおり青木自身は「唱歌」であるとし、「学校や家庭で」歌ってもらえれば本懐であるとしている。しかし発表当時の教育現場では、" +
"本作品を歌うことは原則上はできなかった。</p>";
expect( countCharactersFunction( text ) ).toBe( 757 );
expect( countCharactersFunction( text ) ).toBe( 744 );
} );
it( "makes sure that no charachters are counted when a URL is embedded in video tags", function() {
it( "makes sure that no characters are counted when a URL is embedded in video tags", function() {
const text = "<!-- wp:embed {\"url\":\"https://www.youtube.com/watch?v=cbP2N1BQdYc\",\"type\":\"video\"," +
"\"providerNameSlug\":\"youtube\",\"responsive\":true,\"className\":\"wp-embed-aspect-16-9 wp-has-aspect-ratio\"} -->\n" +
"\t\t\t<figure class=\"wp-block-embed is-type-video is-provider-youtube wp-block-embed-youtube wp-embed-aspect" +
Expand All @@ -48,22 +47,3 @@ describe( "counts characters in a string", function() {
expect( countCharactersFunction( text ) ).toBe( 0 );
} );
} );

describe( "A test to return a regex match for URLs", () => {
it( "Returns true if there is a match against a URL starting with www", () => {
// eslint-disable-next-line max-len
expect( doesWordMatchRegex( "www.yoast.com", "(http(s)?:\\/\\/.)?(www\\.|ftp:\\/\\/)?[-a-zA-Z0-9@:%._\\/+~#=]{2,256}\\.[a-z]{2,6}\\b([-a-zA-Z0-9@:%_\\/+.~#?&()=]*)" ) ).toEqual( true );
} );
it( "Returns true if there is a match against a URL starting with https", () => {
// eslint-disable-next-line max-len
expect( doesWordMatchRegex( "https://www.codecademy.com/learn/hello", "(http(s)?:\\/\\/.)?(www\\.|ftp:\\/\\/)?[-a-zA-Z0-9@:%._\\/+~#=]{2,256}\\.[a-z]{2,6}\\b([-a-zA-Z0-9@:%_\\/+.~#?&()=]*)" ) ).toEqual( true );
} );
it( "Returns true if there is a match against a URL starting with http", () => {
// eslint-disable-next-line max-len
expect( doesWordMatchRegex( "http://foo.com/blah_blah/", "(http(s)?:\\/\\/.)?(www\\.|ftp:\\/\\/)?[-a-zA-Z0-9@:%._\\/+~#=]{2,256}\\.[a-z]{2,6}\\b([-a-zA-Z0-9@:%_\\/+.~#?&()=]*)" ) ).toEqual( true );
} );
it( "Returns false if there is no match", () => {
// eslint-disable-next-line max-len
expect( doesWordMatchRegex( "My cat is sweet!", "(http(s)?:\\/\\/.)?(www\\.|ftp:\\/\\/)?[-a-zA-Z0-9@:%._\\/+~#=]{2,256}\\.[a-z]{2,6}\\b([-a-zA-Z0-9@:%_\\/+.~#?&()=]*)" ) ).toEqual( false );
} );
} );
Original file line number Diff line number Diff line change
Expand Up @@ -175,3 +175,42 @@ describe( "test for prominent words research for languages that have custom help
expect( words ).toEqual( expected );
} );
} );

describe( "test for filtering out URLs and email addresses", () => {
	it( "does not include URLs in prominent words", () => {
		const paper = new Paper( "http://blog.example.com/examples ".repeat( 180 ) + "cats ".repeat( 50 ) );
		const researcher = new Researcher( paper );
		researcher.addResearchData( "morphology", morphologyData );

		// The repeated URL is filtered out, so only "cats" remains prominent.
		expect( getProminentWordsForInsights( paper, researcher ) ).toEqual( [
			new ProminentWord( "cats", "cat", 50 ),
		] );
	} );

	it( "does not include email addresses in prominent words", () => {
		const paper = new Paper( "example89@something.com ".repeat( 180 ) + "cats ".repeat( 50 ) );
		const researcher = new Researcher( paper );
		researcher.addResearchData( "morphology", morphologyData );

		// The repeated email address is filtered out, so only "cats" remains prominent.
		expect( getProminentWordsForInsights( paper, researcher ) ).toEqual( [
			new ProminentWord( "cats", "cat", 50 ),
		] );
	} );

	it( "includes domain names in prominent words", () => {
		const paper = new Paper( "example.com ".repeat( 180 ) + "cats ".repeat( 50 ) );
		const researcher = new Researcher( paper );
		researcher.addResearchData( "morphology", morphologyData );

		// A bare domain name (no scheme, no "www.") is not treated as a URL and stays in the results.
		expect( getProminentWordsForInsights( paper, researcher ) ).toEqual( [
			new ProminentWord( "example.com", "example.com", 180 ),
			new ProminentWord( "cats", "cat", 50 ),
		] );
	} );
} );
Original file line number Diff line number Diff line change
Expand Up @@ -381,5 +381,73 @@ describe( "test for prominent words research for languages that have custom help

expect( words ).toEqual( expected );
} );

it( "does not count URLs and email addresses as prominent words", function() {
	// The text and the title contain only URLs and email addresses, so no prominent words should remain.
	const paper = new Paper( "http://blog.example.com/examples ".repeat( 180 ) + "example@something.com ".repeat( 180 ),
		{ title: "example@something.com example@something.com example@something.com" } );

	const researcher = new Researcher( paper );

	const expected = {
		prominentWords: [],
		hasMetaDescription: false,
		hasTitle: true,
	};

	const words = prominentWordsResearch( paper, researcher );

	expect( words ).toEqual( expected );
} );

it( "counts domain names as prominent words", function() {
	// A bare domain name (no scheme, no "www.") is deliberately not filtered out.
	const paper = new Paper( "yoast.com ".repeat( 180 ) );

	const researcher = new Researcher( paper );

	const expected = {
		prominentWords: [ new ProminentWord( "yoast.com", "yoast.com", 180 ) ],
		hasMetaDescription: false,
		hasTitle: false,
	};

	const words = prominentWordsResearch( paper, researcher );

	expect( words ).toEqual( expected );
} );

it( "does not return prominent words when the text is longer than 100 words including URLs and emails, but shorter " +
	"than 100 words when they are excluded", function() {
	// Only 50 occurrences of "cats" survive the filtering, which is below the 100-word minimum this research requires.
	const paper = new Paper( "http://blog.example.com/examples ".repeat( 180 ) + "example@something.com ".repeat( 180 ) +
		" cats".repeat( 50 ), { title: "example@something.com example@something.com example@something.com" } );

	const researcher = new Researcher( paper );

	const expected = {
		prominentWords: [],
		hasMetaDescription: false,
		hasTitle: true,
	};

	const words = prominentWordsResearch( paper, researcher );

	expect( words ).toEqual( expected );
} );

it( "returns prominent words when the text is longer than 100 words after excluding URLs and emails", function() {
	// 101 occurrences of "cats" survive the filtering, which is just above the 100-word minimum.
	const paper = new Paper( "http://blog.example.com/examples ".repeat( 180 ) + "example@something.com ".repeat( 180 ) +
		" cats".repeat( 101 ), { title: "example@something.com example@something.com example@something.com" } );

	const researcher = new Researcher( paper );

	const expected = {
		prominentWords: [ new ProminentWord( "cats", "cats", 101 ) ],
		hasMetaDescription: false,
		hasTitle: true,
	};

	const words = prominentWordsResearch( paper, researcher );

	expect( words ).toEqual( expected );
} );
} );

Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
// Matches an email address: a local part, "@", and a domain containing at least one dot,
// where none of the parts may contain whitespace or an extra "@".
// A regex literal is used instead of a double-escaped `new RegExp` string, which is error-prone;
// the "g" flag makes `replace` remove every occurrence (the previous "i" and "m" flags had no
// effect, as the pattern contains no letters and no anchors).
const emailRegex = /[^\s@]+@[^\s@]+\.[^\s@]+/g;

/**
 * Removes email addresses from a text.
 *
 * @param {string} text The text to remove emails from.
 *
 * @returns {string} The text without email addresses.
 */
export default function removeEmailAddresses( text ) {
	return text.replace( emailRegex, "" );
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
/*
 * Matches URLs starting with "http://", "https://", "ftp://", or "www.", whether they are embedded in tags or not.
 * It deliberately doesn't match bare domain names (e.g. "yoast.com" in "We got so much traffic on yoast.com").
 *
 * Two fixes over the original double-escaped `new RegExp` string:
 * 1. "(www\\\\.)?" compiled to the regex `(www\\.)?` — "www" followed by a literal backslash and any
 *    character — which could never match "www."; it is now `(www\.)?`.
 * 2. The "ftp" alternative didn't require "://", so any token merely starting with "ftp" could be
 *    stripped, contradicting the contract above; "ftp:\/\/" is now required.
 */
// eslint-disable-next-line max-len
const urlRegex = /(ftp:\/\/|http(s)?:\/\/)(www\.)?[-a-zA-Z0-9@:%._\/+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:;%_\/+.~#?&()=]*)|www\.[-a-zA-Z0-9@:%._\/+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:;%_\/+.~#?&()=]*)/gi;

/**
 * Removes URLs from a text.
 *
 * @param {string} text The text to remove URLs from.
 *
 * @returns {string} The text without URLs.
 */
export default function removeURLs( text ) {
	return text.replace( urlRegex, "" );
}
Original file line number Diff line number Diff line change
@@ -1,19 +1,20 @@
import { languageProcessing } from "yoastseo";
const { sanitizeString } = languageProcessing;
import removeURLs from "../../../helpers/sanitize/removeURLs.js";

/**
 * Calculates the character count which serves as a measure of text length.
 * The character count includes letters, punctuation, and numbers. It doesn't include URLs, HTML tags, spaces, and the
 * content of the Table of Contents and Estimated Reading Time blocks.
 *
 * @param {string} text The text to be counted.
 *
 * @returns {number} The character count of the given text.
 */
export default function( text ) {
	// Strip URLs first so their characters don't inflate the count.
	text = removeURLs( text );
	// Strips markup such as HTML tags from the text (NOTE(review): exact scope depends on yoastseo's sanitizeString — verify).
	text = sanitizeString( text );
	// Remove all whitespace so spaces don't count towards the length.
	text = text.replace( /\s/g, "" );

	return text.length;
}
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ import {
retrieveAbbreviations,
sortProminentWords,
} from "../helpers/prominentWords/determineProminentWords";
import removeURLs from "../helpers/sanitize/removeURLs.js";
import removeEmailAddresses from "../helpers/sanitize/removeEmailAddresses";

/**
* Retrieves the prominent words from the given paper.
Expand All @@ -23,7 +25,10 @@ function getProminentWordsForInsights( paper, researcher ) {
// An optional custom helper to get words from the text.
const getWordsCustomHelper = researcher.getHelper( "getWordsCustomHelper" );

const text = paper.getText();
let text = paper.getText();
// We don't want to include URLs or email addresses in prominent words.
text = removeURLs( text );
text = removeEmailAddresses( text );

// If the language has a custom helper to get words from the text, we don't retrieve the abbreviation.
const abbreviations = getWordsCustomHelper ? [] : retrieveAbbreviations( text );
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,20 @@ import {
} from "../helpers/prominentWords/determineProminentWords";
import { getSubheadingsTopLevel, removeSubheadingsTopLevel } from "../helpers/html/getSubheadings";
import baseStemmer from "../helpers/morphology/baseStemmer";
import removeURLs from "../helpers/sanitize/removeURLs.js";
import removeEmailAddresses from "../helpers/sanitize/removeEmailAddresses";

/**
* Removes URLs and email addresses from the text.
*
* @param {string} text The text to sanitize.
*
* @returns {string} The text without URLs and email addresses.
*/
const sanitizeText = function( text ) {
text = removeURLs( text );
return removeEmailAddresses( text );
};

/**
* Retrieves the prominent words from the given paper.
Expand All @@ -32,9 +46,9 @@ function getProminentWordsForInternalLinking( paper, researcher ) {
// An optional custom helper to count length to use instead of countWords.
const customCountLength = researcher.getHelper( "customCountLength" );

const text = paper.getText();
const metadescription = paper.getDescription();
const title = paper.getTitle();
const text = sanitizeText( paper.getText() );
const metadescription = sanitizeText( paper.getDescription() );
const title = sanitizeText( paper.getTitle() );

const result = {};
result.hasMetaDescription = metadescription !== "";
Expand Down