-
Notifications
You must be signed in to change notification settings - Fork 117
Expand file tree
/
Copy pathget-software.js
More file actions
109 lines (89 loc) · 3.51 KB
/
get-software.js
File metadata and controls
109 lines (89 loc) · 3.51 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
const fs = require('fs');
const path = require('path');
const axios = require('axios');
const yaml = require('js-yaml');
/**
 * Normalize a repository URL so equivalent URLs compare equal.
 *
 * The URL is lower-cased and a trailing literal ".git" suffix is removed.
 *
 * @param {string} url - Repository URL.
 * @returns {string} The normalized URL.
 */
// The dot is escaped so only a real ".git" suffix is stripped; an unescaped
// dot would also eat names that merely end in "git" (e.g. ".../legit").
const normalizeRepoUrl = (url) => url.toLowerCase().replace(/\.git$/, '');
/**
 * Turn a repository-relative file reference into an absolute raw-file URL.
 *
 * Empty values and values that already start with http:// or https:// are
 * returned unchanged. Otherwise the URL is built from the repository host:
 * github.com and bitbucket.org get their dedicated raw-file layouts, and any
 * other host is treated as a GitLab instance.
 *
 * @param {string|undefined} url - File reference, relative to the repository
 *   root or already absolute.
 * @param {string} repo - URL of the repository the file belongs to.
 * @returns {string|undefined} The absolute URL, or `url` itself when it is
 *   empty or already absolute.
 */
const absoluteUrl = (url, repo) => {
  if (!url || (/^http(s)?:\/\//i).test(url)) {
    return url;
  }

  const repoUrl = new URL(normalizeRepoUrl(repo));

  // URL paths always use forward slashes, so join with the POSIX flavour
  // rather than the separator of whatever OS the script runs on.
  const rawPath = (...segments) => path.posix.join(repoUrl.pathname, ...segments, url);

  switch (repoUrl.host.toLowerCase()) {
    case 'github.com':
      return `https://raw.githubusercontent.com${rawPath('HEAD')}`;
    case 'bitbucket.org':
      return `https://bitbucket.org${rawPath('raw/HEAD')}`;
    default:
      // GitLab
      return `${repoUrl.protocol}//${repoUrl.hostname}${rawPath('-/raw/HEAD')}`;
  }
};
/**
 * Return a shallow copy of a software entry with `slug` set to its `id`.
 *
 * @param {object} software - Software entry; its `id` becomes the slug.
 * @returns {object} A new object with every original property plus `slug`.
 */
const addSlug = (software) => {
  const { id } = software;
  return { ...software, slug: id };
};
/**
 * Return a shallow copy of a software entry with an `alias_pages` list.
 *
 * The list holds the entry's own repository URL followed by its aliases,
 * each normalized with `normalizeRepoUrl` and with the first "https://"
 * removed; duplicates are dropped, keeping first-seen order.
 *
 * @param {object} software - Software entry with `url` and an `aliases` array.
 * @returns {object} A new object with every original property plus
 *   `alias_pages`.
 */
const addAliases = (software) => {
  const pages = new Set();
  for (const repo of [software.url, ...software.aliases]) {
    pages.add(normalizeRepoUrl(repo).replace('https://', ''));
  }
  return { ...software, alias_pages: [...pages] };
};
/**
 * Return a shallow copy of a software entry with its `publiccodeYml` text
 * parsed into a `publiccode` property.
 *
 * @param {object} software - Software entry carrying a `publiccodeYml` string.
 * @returns {object} A new object with every original property plus the
 *   parsed `publiccode` value.
 */
const addPubliccodeDict = (software) => {
  // NOTE(review): the YAML text is handed to yaml.load as-is; whether that is
  // safe for remote content depends on the installed js-yaml major version —
  // confirm against the project's lockfile.
  const parsed = yaml.load(software.publiccodeYml);
  return { ...software, publiccode: parsed };
};
/**
 * Extract the `page[after]` cursor from a pagination "next" link.
 *
 * Brackets may be literal or percent-encoded, the value ends at the next
 * parameter or fragment, and percent-escapes in the value are decoded so the
 * HTTP client can encode it again exactly once.
 *
 * @param {string|null|undefined} nextLink - Value of `links.next`.
 * @returns {string} The cursor, or '' when there is no next page.
 * @throws {Error} When a next link is present but has no `page[after]`.
 */
function extractAfterCursor(nextLink) {
  if (!nextLink) {
    return '';
  }

  const match = /(?:^|[?&])page(?:\[|%5B)after(?:\]|%5D)=([^&#]*)/i.exec(nextLink);
  if (!match) {
    // Stopping quietly here would truncate the result set without any sign.
    throw new Error(`Cannot find the page[after] cursor in next link: ${nextLink}`);
  }

  // decodeURIComponent (unlike URLSearchParams) leaves "+" untouched.
  return decodeURIComponent(match[1]);
}

/**
 * Fetch every page of a cursor-paginated collection and return all items.
 *
 * Each response is expected to carry the items in `data.data` and the link
 * to the following page, if any, in `data.links.next`.
 *
 * @param {string} url - Collection endpoint.
 * @param {number} [pageSize=100] - Value sent as the `page[size]` parameter.
 * @returns {Promise<Array>} Items of all pages, in the order received.
 * @throws {Error} When a next link is present but carries no cursor; request
 *   errors from axios propagate unchanged.
 */
async function fetchData(url, pageSize = 100) {
  let afterCursor = '';
  let allData = [];

  do {
    const response = await axios.get(url, {
      params: {
        'page[size]': pageSize,
        'page[after]': afterCursor,
      },
    });

    allData = allData.concat(response.data.data);
    afterCursor = extractAfterCursor(response.data.links.next);
  } while (afterCursor);

  return allData;
}
/**
 * Append the software entries to a file as newline-delimited JSON pairs:
 * an `index` action line followed by the document line.
 *
 * Every entry is first given a slug, a parsed `publiccode` and an absolute
 * logo URL; nothing is written until all entries have been prepared. The
 * file is opened in append mode, so existing content is kept.
 *
 * @param {Array<object>} software - Software entries to export.
 * @param {string} filename - Path of the file to append to.
 * @returns {void}
 */
function toElasticSearchBulkFile(software, filename) {
  const records = software.map((entry) => {
    const parsed = addPubliccodeDict(addSlug(entry));
    const logo = absoluteUrl(parsed.publiccode.logo, parsed.url);
    return { ...parsed, publiccode: { ...parsed.publiccode, logo } };
  });

  const appendLine = (value) => {
    fs.appendFileSync(filename, `${JSON.stringify(value)}\n`);
  };

  for (const record of records) {
    // Action line: which index and document id the next line belongs to
    appendLine({
      'index': {
        '_index': 'developers_italia_it',
        '_id': record.id,
      },
    });

    // Document line
    appendLine({
      'crawltime': record.updatedAt,
      'slug': record.slug,
      'publiccode': record.publiccode,
      'type': 'software',
    });
  }
}
// Endpoint that run() passes to fetchData() to download the software entries.
const url = 'https://api.developers.italia.it/v1/software';
/**
 * Download the software catalogue and write the two derived data files.
 *
 * 1. Appends every entry to `elasticsearch.bulk` via toElasticSearchBulkFile.
 * 2. Writes `_data/crawler/software.yml` with, for each entry: a slug, the
 *    alias pages, the parsed publiccode, and logo/screenshot references
 *    turned into absolute URLs.
 *
 * @returns {Promise<void>} Resolves once both files have been written.
 */
async function run() {
  const software = await fetchData(url);

  toElasticSearchBulkFile(software, 'elasticsearch.bulk');

  // Adapt the data structure to the legacy software.yml format
  const data = software
    .map((entry) => addPubliccodeDict(addAliases(addSlug(entry))))
    .map((entry) => ({
      ...entry,
      publiccode: {
        ...entry.publiccode,
        logo: absoluteUrl(entry.publiccode.logo, entry.url),
      },
    }));

  data.forEach((entry) => {
    // An entry without a description block simply has no screenshots to fix
    const descriptions = entry.publiccode.description ?? {};
    Object.values(descriptions).forEach((desc) => {
      desc.screenshots = desc.screenshots?.map((ss) => absoluteUrl(ss, entry.url));
    });
  });

  // Remove the "url" key as it's used by Jekyll to hold the generated page's URL
  // and Searchyll uses it to compare against when the ignore: options are set in _config.yml.
  const jekyll = data.map(({ url: _repoUrl, ...rest }) => rest);

  fs.writeFileSync('_data/crawler/software.yml', yaml.dump(jekyll));
}
// Script entry point. Handle rejection explicitly so a failed sync is logged
// and reported through a non-zero exit code instead of being left as a
// floating promise.
run().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});