-
Notifications
You must be signed in to change notification settings - Fork 953
Expand file tree
/
Copy pathgetProminentWordsForInternalLinking.js
More file actions
119 lines (103 loc) · 5.04 KB
/
getProminentWordsForInternalLinking.js
File metadata and controls
119 lines (103 loc) · 5.04 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
import { take } from "lodash-es";
import countWords from "../helpers/word/countWords";
import {
collapseProminentWordsOnStem,
filterProminentWords,
getProminentWords,
getProminentWordsFromPaperAttributes,
retrieveAbbreviations,
sortProminentWords,
} from "../helpers/prominentWords/determineProminentWords";
import { getSubheadingsTopLevel, removeSubheadingsTopLevel } from "../helpers/html/getSubheadings";
import baseStemmer from "../helpers/morphology/baseStemmer";
import removeURLs from "../helpers/sanitize/removeURLs.js";
import removeEmailAddresses from "../helpers/sanitize/removeEmailAddresses";
/**
 * Strips URLs and email addresses out of a piece of text.
 *
 * URLs are removed first; email addresses are then removed from that intermediate result.
 *
 * @param {string} text The text to clean up.
 *
 * @returns {string} The text with URLs and email addresses removed.
 */
const sanitizeText = ( text ) => removeEmailAddresses( removeURLs( text ) );
/** Minimum number of words a text must have before prominent words are calculated. */
const MINIMUM_TEXT_LENGTH_IN_WORDS = 100;

/** Minimum length a text must have, as reported by a custom length helper, before prominent words are calculated. */
const MINIMUM_TEXT_LENGTH_CUSTOM = 200;

/** Factor by which the occurrences of words found in the paper attributes are multiplied. */
const ATTRIBUTE_OCCURRENCES_MULTIPLIER = 3;

/** Minimum number of occurrences for a word to be prominent when a language-specific stemmer is used. */
const MINIMUM_OCCURRENCES_WITH_MORPHOLOGY = 4;

/** Minimum number of occurrences for a word to be prominent when only the base stemmer is available. */
const MINIMUM_OCCURRENCES_WITHOUT_MORPHOLOGY = 2;

/** Maximum number of prominent words that is returned. */
const MAXIMUM_NUMBER_OF_PROMINENT_WORDS = 100;

/**
 * Retrieves the prominent words from the given paper.
 *
 * The text, meta description and title are stripped of URLs and email addresses first. When the text is too short,
 * the result is returned with an empty list of prominent words. Otherwise the words from the text (without its
 * top-level subheadings) are combined with the words from the paper attributes (keyphrase, synonyms, title,
 * meta description and top-level subheadings), collapsed on stem, sorted, filtered and capped.
 *
 * @param {Paper}      paper      The paper to determine the prominent words of.
 * @param {Researcher} researcher The researcher to use for analysis.
 *
 * @returns {Object} result A compound result object.
 * @returns {ProminentWord[]} result.prominentWords Prominent words for this paper, filtered and sorted.
 * @returns {boolean} result.hasMetaDescription Whether the metadescription is available in the input paper.
 * @returns {boolean} result.hasTitle Whether the title is available in the input paper.
 */
function getProminentWordsForInternalLinking( paper, researcher ) {
	const functionWords = researcher.getConfig( "functionWords" );
	// An optional custom helper that returns a function to get the stem of a word. It takes precedence over "getStemmer".
	const customStemmer = researcher.getHelper( "customGetStemmer" );
	const stemmer = customStemmer ? customStemmer( researcher ) : researcher.getHelper( "getStemmer" )( researcher );
	// An optional custom helper to get words from the text.
	const getWordsCustomHelper = researcher.getHelper( "getWordsCustomHelper" );
	// An optional custom helper to count the text length, used instead of countWords.
	const customCountLength = researcher.getHelper( "customCountLength" );

	const text = sanitizeText( paper.getText() );
	const metadescription = sanitizeText( paper.getDescription() );
	const title = sanitizeText( paper.getTitle() );

	const result = {
		hasMetaDescription: metadescription !== "",
		hasTitle: title !== "",
		prominentWords: [],
	};

	/*
	 * We only want to return suggestions (and spend time calculating prominent words) if the text is at least 100 words.
	 * When a customCountLength helper is available, the text needs to have a length of at least 200 instead.
	 */
	const isTextTooShort = customCountLength
		? customCountLength( text ) < MINIMUM_TEXT_LENGTH_CUSTOM
		: countWords( text ) < MINIMUM_TEXT_LENGTH_IN_WORDS;
	if ( isTextTooShort ) {
		return result;
	}

	// The third element of every subheading match is used as the subheading text.
	const subheadings = getSubheadingsTopLevel( text ).map( subheading => subheading[ 2 ] );

	const attributes = [
		paper.getKeyword(),
		paper.getSynonyms(),
		title,
		metadescription,
		subheadings.join( " " ),
	];

	/*
	 * If the language has a custom helper to get words from the text, we don't retrieve the abbreviations.
	 * The text and the attributes are joined with a space, so that the last word of the text
	 * and the first word of the keyphrase are not glued together into one (false) token.
	 */
	const abbreviations = getWordsCustomHelper ? [] : retrieveAbbreviations( [ text, ...attributes ].join( " " ) );

	const removedSubheadingText = removeSubheadingsTopLevel( text );

	const prominentWordsFromText = getProminentWords(
		removedSubheadingText, abbreviations, stemmer, functionWords, getWordsCustomHelper );

	const prominentWordsFromPaperAttributes = getProminentWordsFromPaperAttributes(
		attributes, abbreviations, stemmer, functionWords, getWordsCustomHelper );

	/*
	 * If a word is used in any of the attributes, its weight is automatically high.
	 * To make sure the word survives weight filters and gets saved in the database, make the number of occurrences times-3.
	 */
	prominentWordsFromPaperAttributes.forEach( ( relevantWord ) => {
		relevantWord.setOccurrences( relevantWord.getOccurrences() * ATTRIBUTE_OCCURRENCES_MULTIPLIER );
	} );

	const collapsedWords = collapseProminentWordsOnStem( prominentWordsFromPaperAttributes.concat( prominentWordsFromText ) );
	// The return value is not used: the list is expected to be sorted in place.
	sortProminentWords( collapsedWords );

	/*
	 * If morphology data are available for a language, the minimum number of occurrences to consider a word to be prominent is 4.
	 * This minimum number was chosen in order to avoid premature suggestions of words from the paper attributes.
	 * These get a times-3 boost and would therefore be prominent with just 1 occurrence.
	 *
	 * If morphology data are not available (the stemmer is the base stemmer), and therefore word forms are not recognized,
	 * the minimum threshold is lowered to 2.
	 */
	const minimumNumberOfOccurrences = stemmer === baseStemmer
		? MINIMUM_OCCURRENCES_WITHOUT_MORPHOLOGY
		: MINIMUM_OCCURRENCES_WITH_MORPHOLOGY;

	/*
	 * Return the 100 top items from the collapsed and sorted list. The number is picked deliberately to prevent larger
	 * articles from getting too long of lists.
	 */
	result.prominentWords = take(
		filterProminentWords( collapsedWords, minimumNumberOfOccurrences ),
		MAXIMUM_NUMBER_OF_PROMINENT_WORDS
	);

	return result;
}
export default getProminentWordsForInternalLinking;