-
Notifications
You must be signed in to change notification settings - Fork 953
Expand file tree
/
Copy pathgetProminentWordsForInsights.js
More file actions
48 lines (41 loc) · 2.03 KB
/
getProminentWordsForInsights.js
File metadata and controls
48 lines (41 loc) · 2.03 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
import { take } from "lodash-es";
import {
collapseProminentWordsOnStem,
filterProminentWords,
getProminentWords,
retrieveAbbreviations,
sortProminentWords,
} from "../helpers/prominentWords/determineProminentWords";
import removeURLs from "../helpers/sanitize/removeURLs.js";
import removeEmailAddresses from "../helpers/sanitize/removeEmailAddresses";
/**
 * Determines the prominent words of a paper for use in the insights.
 *
 * The text is stripped of URLs and email addresses, turned into a list of prominent words,
 * collapsed on stem, sorted, reduced to the words that occur at least 5 times and finally
 * cut down to the first 20 entries.
 *
 * @param {Paper}      paper      The paper to determine the prominent words of.
 * @param {Researcher} researcher The researcher to use for analysis. It supplies the "functionWords"
 *                                config and the "customGetStemmer", "getStemmer" and
 *                                "getWordsCustomHelper" helpers.
 *
 * @returns {WordCombination[]} Prominent words for this paper, filtered and sorted.
 */
function getProminentWordsForInsights( paper, researcher ) {
	const functionWords = researcher.getConfig( "functionWords" );

	// Prefer the optional custom stemmer factory; the regular one is only looked up when no custom one is registered.
	const createStemmer = researcher.getHelper( "customGetStemmer" ) || researcher.getHelper( "getStemmer" );
	const stemmer = createStemmer( researcher );

	// An optional, language-specific helper to get the words from the text.
	const customGetWords = researcher.getHelper( "getWordsCustomHelper" );

	// URLs and email addresses should not end up in the prominent words, so strip them first (URLs, then addresses).
	const sanitizedText = removeEmailAddresses( removeURLs( paper.getText() ) );

	// Abbreviations are only retrieved when the language has no custom helper to get words from the text.
	let abbreviations = [];
	if ( ! customGetWords ) {
		abbreviations = retrieveAbbreviations( sanitizedText );
	}

	// Collapse the list of prominent words on stems.
	const wordsPerStem = collapseProminentWordsOnStem(
		getProminentWords( sanitizedText, abbreviations, stemmer, functionWords, customGetWords )
	);

	// The return value is not used: the collapsed list itself is what gets filtered below.
	sortProminentWords( wordsPerStem );

	// Filter out all words that occur less than 5 times in the text and return the 20 top items.
	const frequentWords = filterProminentWords( wordsPerStem, 5 );
	return take( frequentWords, 20 );
}
export default getProminentWordsForInsights;