Skip to content

Commit df0429e

Browse files
authored
Merge pull request #19137 from Yoast/PC-865-exclude-links-from-prominent-words
Exclude links and email addresses from prominent words analysis
2 parents cd741f1 + 8ccdf72 commit df0429e

12 files changed

Lines changed: 238 additions & 35 deletions

File tree

packages/yoastseo/spec/fullTextTests/testTexts/ja/japanesePaper.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ const expectedResults = {
6666
textLength: {
6767
isApplicable: true,
6868
score: 9,
69-
resultText: "<a href='https://yoa.st/34n' target='_blank'>Text length</a>: The text contains 3165 characters. Good job!",
69+
resultText: "<a href='https://yoa.st/34n' target='_blank'>Text length</a>: The text contains 3022 characters. Good job!",
7070
},
7171
externalLinks: {
7272
isApplicable: true,
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
import removeEmailAddresses from "../../../../src/languageProcessing/helpers/sanitize/removeEmailAddresses.js";
2+
3+
describe( "a test for removing email addresses from a string", () => {
	// Each entry is [ test title, input ]; every input is expected to be removed completely.
	const removableAddresses = [
		[ "removes an email address", "example@something.com" ],
		[ "removes an email address with special characters", "some+long+email+address23@some+host-weird-/looking.com" ],
		[ "removes a very short email address", "a@b.com" ],
	];

	removableAddresses.forEach( ( [ title, address ] ) => {
		it( title, () => {
			expect( removeEmailAddresses( address ) ).toBe( "" );
		} );
	} );

	it( "does not remove invalid email addresses", () => {
		// Inputs missing the local part, the top-level domain, the domain, or the "@" must come back untouched.
		const invalidAddresses = [ "@b.com", "a@b", "example@", "example.com" ];

		for ( const candidate of invalidAddresses ) {
			expect( removeEmailAddresses( candidate ) ).toBe( candidate );
		}
	} );
} );
Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
import removeURLs from "../../../../src/languageProcessing/helpers/sanitize/removeURLs.js";
2+
3+
describe( "a test for removing URLs from a string", () => {
	// Inputs that must be removed in their entirety.
	const fullyRemoved = [
		{ title: "removes a base URL", input: "https://example.com" },
		{ title: "removes a URL followed by a subdirectory", input: "https://example.com/example1" },
		{ title: "removes a URL followed by multiple subdirectories", input: "https://example.com/example1/part1" },
		{ title: "removes a URL with a subdomain", input: "https://blog.example.com/examples" },
		{ title: "removes a URL starting with http://", input: "http://blog.example.com/examples" },
		{ title: "removes a URL containing www.", input: "http://www.blog.example.com/examples" },
		{ title: "removes a URL starting with www.", input: "www.blog.example.com/examples" },
		{ title: "removes a URL starting with ftp", input: "ftp://example.com" },
		{ title: "removes a URL containing special characters.", input: "https://www.example.com/foo/?bar=baz&inga=42&quux" },
		{ title: "removes a URL containing a semi-colon.", input: "https://www.example.com/foo/?bar=baz&amp;inga=42&amp;quux" },
		{ title: "removes a URL containing more special characters.", input: "http://foo.com/blah_(wikipedia)_blah#cite-1" },
		{ title: "removes a URL with a different top-level domain", input: "http://example.co.uk" },
	];

	// Inputs that must be returned exactly as they were passed in.
	const leftUntouched = [
		{ title: "does not remove a URL that doesn't start with 'http(s)://', 'ftp://' or 'www'.", input: "example.com" },
		{ title: "does not remove https:// on its own", input: "https://" },
		{ title: "does not remove a URL without a top-level domain", input: "https://example" },
	];

	// Registration order matches the original file: removals first, then the Japanese case, then the untouched inputs.
	const cases = [
		...fullyRemoved.map( ( testCase ) => ( { ...testCase, expected: "" } ) ),
		{
			title: "removes a URL followed by Japanese characters",
			input: "https://example.comこれに対し日本国有鉄道",
			expected: "これに対し日本国有鉄道",
		},
		...leftUntouched.map( ( testCase ) => ( { ...testCase, expected: testCase.input } ) ),
	];

	for ( const { title, input, expected } of cases ) {
		it( title, () => {
			expect( removeURLs( input ) ).toBe( expected );
		} );
	}
} );

packages/yoastseo/spec/languageProcessing/helpers/sentence/sentencesLengthSpec.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ describe( "A test to count sentence lengths.", function() {
3535

3636
expect( lengths ).toEqual( [
3737
{ sentence: "自然おのずから存在しているもの", sentenceLength: 15 },
38-
{ sentence: "歩くさわやかな森 <span style='color: red;'> 自然 </span>", sentenceLength: 11 },
38+
{ sentence: "歩くさわやかな森 <span style='color: red;'> 自然 </span>", sentenceLength: 10 },
3939
] );
4040
} );
4141
} );
Lines changed: 4 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
import { doesWordMatchRegex } from "../../../../../src/languageProcessing/helpers/morphology/regexHelpers";
21
import countCharactersFunction from "../../../../../src/languageProcessing/languages/ja/helpers/countCharacters.js";
32

43
describe( "counts characters in a string", function() {
@@ -13,9 +12,9 @@ describe( "counts characters in a string", function() {
1312
"東京オリンピック開会直前の1964年(昭和39年)10月1日に開業した。" ) ).toBe( 136 );
1413
} );
1514
it( "makes sure the countCharacters function still works when the input is a non-Japanese string", function() {
16-
expect( countCharactersFunction( "this is a string" ) ).toBe( 16 );
15+
expect( countCharactersFunction( "this is a string" ) ).toBe( 13 );
1716
expect( countCharactersFunction( "Низът в компютърните науки е крайна поредица от символи " +
18-
"(представляващи краен брой знаци)." ) ).toBe( 90 );
17+
"(представляващи краен брой знаци)." ) ).toBe( 78 );
1918
} );
2019
it( "makes sure that the table of contents is excluded from the calculation", function() {
2120
const text = "<div class=\"wp-block-yoast-seo-table-of-contents yoast-table-of-contents\"><h2>目次</h2><ul><li><a " +
@@ -36,9 +35,9 @@ describe( "counts characters in a string", function() {
3635
"<p>戦後においては一般に広義の<a href=\"https://ja.wikipedia.org/wiki/%E7%AB%A5%E8%AC%A1\">童謡</a>にカテゴライズされる本作品は、" +
3736
"初出本の題名にもあるとおり青木自身は「唱歌」であるとし、「学校や家庭で」歌ってもらえれば本懐であるとしている。しかし発表当時の教育現場では、" +
3837
"本作品を歌うことは原則上はできなかった。</p>";
39-
expect( countCharactersFunction( text ) ).toBe( 757 );
38+
expect( countCharactersFunction( text ) ).toBe( 744 );
4039
} );
41-
it( "makes sure that no charachters are counted when a URL is embedded in video tags", function() {
40+
it( "makes sure that no characters are counted when a URL is embedded in video tags", function() {
4241
const text = "<!-- wp:embed {\"url\":\"https://www.youtube.com/watch?v=cbP2N1BQdYc\",\"type\":\"video\"," +
4342
"\"providerNameSlug\":\"youtube\",\"responsive\":true,\"className\":\"wp-embed-aspect-16-9 wp-has-aspect-ratio\"} -->\n" +
4443
"\t\t\t<figure class=\"wp-block-embed is-type-video is-provider-youtube wp-block-embed-youtube wp-embed-aspect" +
@@ -48,22 +47,3 @@ describe( "counts characters in a string", function() {
4847
expect( countCharactersFunction( text ) ).toBe( 0 );
4948
} );
5049
} );
51-
52-
describe( "A test to return a regex match for URLs", () => {
53-
it( "Returns true if there is a match against a URL starting with www", () => {
54-
// eslint-disable-next-line max-len
55-
expect( doesWordMatchRegex( "www.yoast.com", "(http(s)?:\\/\\/.)?(www\\.|ftp:\\/\\/)?[-a-zA-Z0-9@:%._\\/+~#=]{2,256}\\.[a-z]{2,6}\\b([-a-zA-Z0-9@:%_\\/+.~#?&()=]*)" ) ).toEqual( true );
56-
} );
57-
it( "Returns true if there is a match against a URL starting with https", () => {
58-
// eslint-disable-next-line max-len
59-
expect( doesWordMatchRegex( "https://www.codecademy.com/learn/hello", "(http(s)?:\\/\\/.)?(www\\.|ftp:\\/\\/)?[-a-zA-Z0-9@:%._\\/+~#=]{2,256}\\.[a-z]{2,6}\\b([-a-zA-Z0-9@:%_\\/+.~#?&()=]*)" ) ).toEqual( true );
60-
} );
61-
it( "Returns true if there is a match against a URL starting with http", () => {
62-
// eslint-disable-next-line max-len
63-
expect( doesWordMatchRegex( "http://foo.com/blah_blah/", "(http(s)?:\\/\\/.)?(www\\.|ftp:\\/\\/)?[-a-zA-Z0-9@:%._\\/+~#=]{2,256}\\.[a-z]{2,6}\\b([-a-zA-Z0-9@:%_\\/+.~#?&()=]*)" ) ).toEqual( true );
64-
} );
65-
it( "Returns false if there is no match", () => {
66-
// eslint-disable-next-line max-len
67-
expect( doesWordMatchRegex( "My cat is sweet!", "(http(s)?:\\/\\/.)?(www\\.|ftp:\\/\\/)?[-a-zA-Z0-9@:%._\\/+~#=]{2,256}\\.[a-z]{2,6}\\b([-a-zA-Z0-9@:%_\\/+.~#?&()=]*)" ) ).toEqual( false );
68-
} );
69-
} );

packages/yoastseo/spec/languageProcessing/researches/getProminentWordsForInsightsSpec.js

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -175,3 +175,42 @@ describe( "test for prominent words research for languages that have custom help
175175
expect( words ).toEqual( expected );
176176
} );
177177
} );
178+
179+
describe( "test for filtering out URLs and email addresses", () => {
	// Fifty occurrences of "cats", appended to every text below.
	const catsFiller = "cats ".repeat( 50 );

	/**
	 * Runs the prominent words for insights research on a paper built from the given text.
	 *
	 * @param {string} text The text of the paper.
	 *
	 * @returns {Array} The result of the research.
	 */
	const researchProminentWords = ( text ) => {
		const paper = new Paper( text );
		const researcher = new Researcher( paper );
		researcher.addResearchData( "morphology", morphologyData );

		return getProminentWordsForInsights( paper, researcher );
	};

	it( "does not include URLs in prominent words", () => {
		const result = researchProminentWords( "http://blog.example.com/examples ".repeat( 180 ) + catsFiller );

		expect( result ).toEqual( [
			new ProminentWord( "cats", "cat", 50 ),
		] );
	} );

	it( "does not include email addresses in prominent words", () => {
		const result = researchProminentWords( "example89@something.com ".repeat( 180 ) + catsFiller );

		expect( result ).toEqual( [
			new ProminentWord( "cats", "cat", 50 ),
		] );
	} );

	it( "includes domain names in prominent words", () => {
		// A bare domain name (no protocol, no "www.") is expected to be kept as a word.
		const result = researchProminentWords( "example.com ".repeat( 180 ) + catsFiller );

		expect( result ).toEqual( [
			new ProminentWord( "example.com", "example.com", 180 ),
			new ProminentWord( "cats", "cat", 50 ),
		] );
	} );
} );

packages/yoastseo/spec/languageProcessing/researches/getProminentWordsForInternalLinkingSpec.js

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -381,5 +381,73 @@ describe( "test for prominent words research for languages that have custom help
381381

382382
expect( words ).toEqual( expected );
383383
} );
384+
385+
it( "does not count URLs and email addresses as prominent words", () => {
	// The body consists of URLs and email addresses only; the title consists of email addresses only.
	const urls = "http://blog.example.com/examples ".repeat( 180 );
	const emails = "example@something.com ".repeat( 180 );
	const title = "example@something.com example@something.com example@something.com";
	const paper = new Paper( urls + emails, { title } );

	const result = prominentWordsResearch( paper, new Researcher( paper ) );

	expect( result ).toEqual( {
		prominentWords: [],
		hasMetaDescription: false,
		hasTitle: true,
	} );
} );
401+
402+
it( "counts domain names as prominent words", () => {
	// A bare domain name (no protocol, no "www.") repeated 180 times, without a title.
	const paper = new Paper( "yoast.com ".repeat( 180 ) );

	const result = prominentWordsResearch( paper, new Researcher( paper ) );

	expect( result ).toEqual( {
		prominentWords: [ new ProminentWord( "yoast.com", "yoast.com", 180 ) ],
		hasMetaDescription: false,
		hasTitle: false,
	} );
} );
417+
418+
// The two title fragments are joined with a space; without it the test name read "but shorterthan 100 words".
it( "does not return prominent words when the text is longer than 100 words including URLs and emails, but shorter " +
	"than 100 words when they are excluded", function() {
	// 180 URLs and 180 email addresses followed by only 50 regular words.
	const paper = new Paper( "http://blog.example.com/examples ".repeat( 180 ) + "example@something.com ".repeat( 180 ) +
		" cats".repeat( 50 ), { title: "example@something.com example@something.com example@something.com" } );

	const researcher = new Researcher( paper );

	// No prominent words are expected for this text, while the title is still reported as present.
	const expected = {
		prominentWords: [],
		hasMetaDescription: false,
		hasTitle: true,
	};

	const words = prominentWordsResearch( paper, researcher );

	expect( words ).toEqual( expected );
} );
435+
436+
it( "returns prominent words when the text is longer than 100 words after excluding URLs and emails", () => {
	// 180 URLs and 180 email addresses followed by 101 regular words.
	const urls = "http://blog.example.com/examples ".repeat( 180 );
	const emails = "example@something.com ".repeat( 180 );
	const regularWords = " cats".repeat( 101 );
	const title = "example@something.com example@something.com example@something.com";
	const paper = new Paper( urls + emails + regularWords, { title } );

	const result = prominentWordsResearch( paper, new Researcher( paper ) );

	expect( result ).toEqual( {
		prominentWords: [ new ProminentWord( "cats", "cats", 101 ) ],
		hasMetaDescription: false,
		hasTitle: true,
	} );
} );
384452
} );
385453

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
/*
 * Matches "<local part>@<domain>.<rest>", where each of the three parts is a run of one or more characters
 * that are neither whitespace nor "@". Punctuation that directly touches the address is therefore part of
 * the match. The pattern contains no letters and no anchors, so only the global flag is needed.
 */
const emailRegex = /[^\s@]+@[^\s@]+\.[^\s@]+/g;

/**
 * Removes email addresses from a text.
 *
 * Strings that lack a local part ("@b.com"), a dot in the domain ("a@b"), a domain ("example@")
 * or the "@" sign ("example.com") are left untouched.
 *
 * @param {string} text The text to remove emails from.
 *
 * @returns {string} The text without email addresses.
 */
export default function removeEmailAddresses( text ) {
	return text.replace( emailRegex, "" );
}
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
// Characters allowed between the URL prefix and the dot that precedes the top-level domain.
const hostChars = "[-a-zA-Z0-9@:%._\\/+~#=]";
// Characters allowed after the top-level domain (path, query string, fragment, HTML-encoded ampersands).
const tailChars = "[-a-zA-Z0-9@:;%_\\/+.~#?&()=]";
// A dot, a top-level domain of 2 to 6 letters ending on a word boundary, and an optional tail.
const tldAndTail = `\\.[a-z]{2,6}\\b${ tailChars }*`;

/*
 * Matches URLs, whether they are embedded in tags or not. A URL must start with "ftp://", "http://", "https://"
 * or "www."; a bare domain name (e.g. "yoast.com" in "We got so much traffic on yoast.com after the latest
 * release") is not matched.
 *
 * - The protocol alternatives are grouped, so that "ftp" requires "://" just like "http(s)" does. Without the
 *   grouping, the letters "ftp" inside an ordinary word (e.g. "softpedia.com") were treated as a URL prefix.
 * - After "://" a single arbitrary character followed by zero or more host characters is accepted, so that
 *   short host names (e.g. "https://t.co/abc") are matched as well.
 * - No anchors are used, so only the global and case-insensitive flags are needed.
 */
const urlRegex = new RegExp(
	`(?:ftp|https?):\\/\\/.${ hostChars }{0,256}${ tldAndTail }` +
	`|www\\.${ hostChars }{1,256}${ tldAndTail }`,
	"gi"
);

/**
 * Removes URLs from a text.
 *
 * Only URLs that start with "ftp://", "http://", "https://" or "www." and that contain a top-level domain
 * are removed. Bare domain names, a protocol on its own ("https://") and URLs without a top-level domain
 * ("https://example") are left untouched.
 *
 * @param {string} text The text to remove URLs from.
 *
 * @returns {string} The text without URLs.
 */
export default function removeURLs( text ) {
	return text.replace( urlRegex, "" );
}
Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,20 @@
11
import { languageProcessing } from "yoastseo";
22
const { sanitizeString } = languageProcessing;
3+
import removeURLs from "../../../helpers/sanitize/removeURLs.js";
34

45
/**
 * Calculates the character count which serves as a measure of text length.
 * The character count includes letters, punctuation, and numbers. It doesn't include URLs, HTML tags, spaces, and the
 * content of the Table of Contents and Estimated Reading Time blocks.
 *
 * Characters are counted per Unicode code point, so that a character outside the Basic Multilingual Plane
 * (which takes up two UTF-16 code units in a JavaScript string) is counted once rather than twice.
 *
 * @param {string} text The text to be counted.
 *
 * @returns {number} The character count of the given text.
 */
export default function countCharacters( text ) {
	// URLs are removed before sanitizing, so that they don't contribute to the count.
	const withoutURLs = removeURLs( text );
	// NOTE(review): removal of HTML tags and of the excluded blocks is presumably done by sanitizeString; confirm in yoastseo.
	const sanitized = sanitizeString( withoutURLs );
	const withoutWhitespace = sanitized.replace( /\s/g, "" );

	// Spreading a string iterates over code points, whereas `length` counts UTF-16 code units.
	return [ ...withoutWhitespace ].length;
}

0 commit comments

Comments
 (0)