Skip to content

Commit b0baf9d

Browse files
authored
Merge pull request #17970 from Yoast/LINGO-1259-exclude-url-from-character-count
Exclude URLs from character count in Japanese
2 parents 8fc53b1 + 455e244 commit b0baf9d

4 files changed

Lines changed: 45 additions & 2 deletions

File tree

packages/yoastseo/spec/fullTextTests/testTexts/ja/japanesePaper.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ const expectedResults = {
6565
textLength: {
6666
isApplicable: true,
6767
score: 9,
68-
resultText: "<a href='https://yoa.st/34n' target='_blank'>Text length</a>: The text contains 3181 characters. Good job!",
68+
resultText: "<a href='https://yoa.st/34n' target='_blank'>Text length</a>: The text contains 3165 characters. Good job!",
6969
},
7070
externalLinks: {
7171
isApplicable: true,

packages/yoastseo/spec/languageProcessing/languages/ja/helpers/countCharactersSpec.js

Lines changed: 34 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import { doesWordMatchRegex } from "../../../../../src/languageProcessing/helpers/morphology/regexHelpers";
12
import countCharactersFunction from "../../../../../src/languageProcessing/languages/ja/helpers/countCharacters.js";
23

34
describe( "counts characters in a string", function() {
@@ -6,6 +7,11 @@ describe( "counts characters in a string", function() {
67
"高速運転が可能な標準軌新線を建設することを決定。1959年(昭和34年)4月20日、新丹那トンネル熱海口で起工式を行って着工し、" +
78
"東京オリンピック開会直前の1964年(昭和39年)10月1日に開業した。" ) ).toBe( 136 );
89
} );
10+
it( "returns the number of characters not including URL characters in the count", function() {
11+
expect( countCharactersFunction( "www.yoast.comこれに対し日本国有鉄道(国鉄)は、十河信二国鉄総裁と技師長の島秀雄の下、" +
12+
"高速運転が可能な標準軌新線を建設することを決定。1959年(昭和34年)4月20日、新丹那トンネル熱海口で起工式を行って着工し、" +
13+
"東京オリンピック開会直前の1964年(昭和39年)10月1日に開業した。" ) ).toBe( 136 );
14+
} );
915
it( "makes sure the countCharacters function still works when the input is a non-Japanese string", function() {
1016
expect( countCharactersFunction( "this is a string" ) ).toBe( 16 );
1117
expect( countCharactersFunction( "Низът в компютърните науки е крайна поредица от символи " +
@@ -30,7 +36,34 @@ describe( "counts characters in a string", function() {
3036
"<p>戦後においては一般に広義の<a href=\"https://ja.wikipedia.org/wiki/%E7%AB%A5%E8%AC%A1\">童謡</a>にカテゴライズされる本作品は、" +
3137
"初出本の題名にもあるとおり青木自身は「唱歌」であるとし、「学校や家庭で」歌ってもらえれば本懐であるとしている。しかし発表当時の教育現場では、" +
3238
"本作品を歌うことは原則上はできなかった。</p>";
39+
expect( countCharactersFunction( text ) ).toBe( 757 );
40+
} );
41+
it( "makes sure that no characters are counted when a URL is embedded in video tags", function() {
42+
const text = "<!-- wp:embed {\"url\":\"https://www.youtube.com/watch?v=cbP2N1BQdYc\",\"type\":\"video\"," +
43+
"\"providerNameSlug\":\"youtube\",\"responsive\":true,\"className\":\"wp-embed-aspect-16-9 wp-has-aspect-ratio\"} -->\n" +
44+
"\t\t\t<figure class=\"wp-block-embed is-type-video is-provider-youtube wp-block-embed-youtube wp-embed-aspect" +
45+
"-16-9 wp-has-aspect-ratio\"><div class=\"wp-block-embed__wrapper\">\n" +
46+
"\t\t\t\thttps://www.youtube.com/watch?v=cbP2N1BQdYc\n" +
47+
"\t\t\t</div></figure><!-- /wp:embed -->";
48+
expect( countCharactersFunction( text ) ).toBe( 0 );
49+
} );
50+
} );
3351

34-
expect( countCharactersFunction( text ) ).toBe( 760 );
52+
describe( "A test to return a regex match for URLs", () => {
53+
it( "Returns true if there is a match against a URL starting with www", () => {
54+
// eslint-disable-next-line max-len
55+
expect( doesWordMatchRegex( "www.yoast.com", "(http(s)?:\\/\\/.)?(www\\.|ftp:\\/\\/)?[-a-zA-Z0-9@:%._\\/+~#=]{2,256}\\.[a-z]{2,6}\\b([-a-zA-Z0-9@:%_\\/+.~#?&()=]*)" ) ).toEqual( true );
56+
} );
57+
it( "Returns true if there is a match against a URL starting with https", () => {
58+
// eslint-disable-next-line max-len
59+
expect( doesWordMatchRegex( "https://www.codecademy.com/learn/hello", "(http(s)?:\\/\\/.)?(www\\.|ftp:\\/\\/)?[-a-zA-Z0-9@:%._\\/+~#=]{2,256}\\.[a-z]{2,6}\\b([-a-zA-Z0-9@:%_\\/+.~#?&()=]*)" ) ).toEqual( true );
60+
} );
61+
it( "Returns true if there is a match against a URL starting with http", () => {
62+
// eslint-disable-next-line max-len
63+
expect( doesWordMatchRegex( "http://foo.com/blah_blah/", "(http(s)?:\\/\\/.)?(www\\.|ftp:\\/\\/)?[-a-zA-Z0-9@:%._\\/+~#=]{2,256}\\.[a-z]{2,6}\\b([-a-zA-Z0-9@:%_\\/+.~#?&()=]*)" ) ).toEqual( true );
64+
} );
65+
it( "Returns false if there is no match", () => {
66+
// eslint-disable-next-line max-len
67+
expect( doesWordMatchRegex( "My cat is sweet!", "(http(s)?:\\/\\/.)?(www\\.|ftp:\\/\\/)?[-a-zA-Z0-9@:%._\\/+~#=]{2,256}\\.[a-z]{2,6}\\b([-a-zA-Z0-9@:%_\\/+.~#?&()=]*)" ) ).toEqual( false );
3568
} );
3669
} );

packages/yoastseo/src/languageProcessing/helpers/sanitize/stripSpaces.js

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,5 +17,11 @@ export default function( text ) {
1717
// Remove first/last character if space
1818
text = text.replace( /^\s+|\s+$/g, "" );
1919

20+
// Replace spaces followed by Japanese periods with only the period.
21+
text = text.replace( /\s。/g, "。" );
22+
23+
// Replace spaces after Japanese periods with only the period.
24+
text = text.replace( /。\s/g, "。" );
25+
2026
return text;
2127
}

packages/yoastseo/src/languageProcessing/languages/ja/helpers/countCharacters.js

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,10 @@ const { sanitizeString } = languageProcessing;
99
* @returns {int} The character count of the given text.
1010
*/
1111
export default function( text ) {
12+
// This regex is used to match URLs in the text, either embedded in tags or not, so that they are excluded from the characters count.
13+
// eslint-disable-next-line max-len
14+
const urlRegex = new RegExp( "(http(s)?:\\/\\/.)?(www\\.|ftp:\\/\\/)?[-a-zA-Z0-9@:%._\\/+~#=]{2,256}\\.[a-z]{2,6}\\b([-a-zA-Z0-9@:%_\\/+.~#?&()=]*)", "igm" );
15+
text = text.replace( urlRegex, "" );
1216
text = sanitizeString( text );
1317

1418
return text.length;

0 commit comments

Comments
 (0)