metrics-report/metrics_report.Rmd at main · QualitativeDataRepository/metrics-report · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
---
title: "QDR Metrics Report"
author: "Qualitative Data Repository"
date: "Generated `r Sys.time()`"
output:
  pdf_document:
    includes:
      in_header: header.tex
  html_document: default
editor_options:
  chunk_output_type: console
---

```{r setup, include=FALSE, warning=FALSE}
# Chunk defaults for the whole report: hide code, warnings and messages
knitr::opts_chunk$set(echo = FALSE, warning = FALSE, message = FALSE)
# Render NA cells in kable tables as "0"
opts <- options(knitr.kable.NA = "0")
library(httr)
library(readr)
library(tidyr)
library(dplyr)
library(ggplot2)
library(ggfortify)
library(stringi)

# load other functions
source("R/im.R") # institutional members stats
source("R/deposits.R") # deposits on dataverse
source("R/mdc.R") # make data count stats

### Configurable variables

# Reporting window (YYYY-MM-DD), supplied by the workflow through the
# `period_begin` / `period_end` environment variables.
# A bound that is not provided falls back to year-to-date: January 1st of the
# current year for the start, today's date for the end.
period_begin <- as.Date(Sys.getenv("period_begin"))
if (is.na(period_begin)) {
  period_begin <- as.Date(format(Sys.Date(), "%Y-01-01"))
}

period_end <- as.Date(Sys.getenv("period_end"))
if (is.na(period_end)) {
  period_end <- Sys.Date()
}

# Set tokens and file location settings
dataverse_host <- "data.qdr.syr.edu"
dataverse_key <- Sys.getenv("DATAVERSE_TOKEN") # Github secret
drupal_folder <- paste0(Sys.getenv("EXTERNAL_GH_FOLDER"), "/drupal") # Github secret

# How many logins before we assume a user is QDR staff?
QDR_staff_logins <- 10

# Institutional members
im <- read_csv(paste0(Sys.getenv("EXTERNAL_GH_FOLDER"), "/institutional_members.csv"))

# Add the prospective institutional member(s): a comma-separated list passed in
# from the github actions workflow.
# `extend` is assigned unconditionally because both the check below and the
# "Parameters" section of the report read it.
extend <- Sys.getenv("extend_im")
if (extend != "") {
  # Drop stray spaces around the commas so they do not end up in the regexp
  extend <- trimws(unlist(strsplit(extend, ",")))
  im <- im %>% bind_rows(tibble(
    canonical = extend, drupal = extend, domain = NA, regexp = extend))
}

# Drupal exports: page-view statistics and the per-user login history
use <- read_csv(paste0(drupal_folder, "/use-stats.csv"))
login <- read_csv(paste0(drupal_folder, "/login-history.csv"))

# Parse the date columns, normalise institution names and derive per-user flags
login <- login %>%
  mutate(
    Created = as.Date(Created, format = "%m/%d/%y"),
    `Last login` = as.Date(`Last login`, format = "%a, %m/%d/%Y - %R"),
    Institution = trimws(toupper(Institution)),
    full_name = paste(`Last Name`, `First Name`, sep = ", "),
    # More logins than the QDR_staff_logins threshold set above
    frequent_flyer = `Logins during time period` > QDR_staff_logins,
    # Last login falls after the start of the reporting period
    recent = as.Date(`Last login`) > period_begin
  )

# Tag each user via the checkif_member() helper
login[["member"]] <- checkif_member(login, im)

# Basic drupal summary stats
drupal_users <- nrow(login)
drupal_new_users <- with(login, sum(Created > period_begin & Created < period_end))
drupal_unique_logins <- as.integer(sum(login[["recent"]]))
drupal_logins_without_ff <- with(login, sum(`Logins during time period`[!frequent_flyer]))

# Download and format dataverse data
dataverse <- paste0('http://', dataverse_host, "/api/info/metrics")
# Locally cache dataverse data as RDS files
# This is mainly useful because it reduces API calls when compiling to multiple formats (i.e. PDF+HTML)
# The cache is only used when all three files exist: they are written one
# after another, so an interrupted run can leave some of them missing.
cache_files <- c(deposits = "deposits.rds", projects = "projects.rds", mdc = "mdc.rds")
if (all(file.exists(cache_files))) {
  deposits <- readRDS(cache_files[["deposits"]])
  projects <- readRDS(cache_files[["projects"]])
  mdc <- readRDS(cache_files[["mdc"]])
} else {
  # This implements the search API to get all available data for all datasets
  deposits <- deposits_get(dataverse_host, dataverse_key)
  deposits$dvurl <- paste0("http://", dataverse_host,
                           "/api/datasets/:persistentId/?persistentId=",
                           deposits$global_id)

  # For each deposit, keep just the affiliation of every listed contact
  institutions <- lapply(deposits$contacts, function(contact_group) {
    lapply(contact_group, function(contact) {
      contact$affiliation
    })
  })

  # We query individual deposit to get *depositor* affiliation
  # This extra API call would be unnecessary to only get contact affiliation
  # institutions <- lapply(deposits$dvurl, deposit_affiliations, dataverse_key)

  # Run all the affiliations against a regex of institutional members
  deposits$member <- member_regexp(institutions, im)
  saveRDS(deposits, cache_files[["deposits"]])

  # These are aggregate stats, only require one API call
  response <- GET(paste0(dataverse, "/uniquedownloads/monthly"))
  # Fail here on an HTTP error rather than parsing and caching an error body
  stop_for_status(response)
  projects <- content(response)
  projects <- bind_rows(projects$data)
  projects$date <- as.Date(paste0(projects$date, "-01"))
  saveRDS(projects, cache_files[["projects"]])

  mdc <- mdc_get(dataverse_host)
  saveRDS(mdc, cache_files[["mdc"]])
}
```

## Parameters

- Reporting period: `r period_begin` to `r period_end`.
- Non-member institutions considered: `r extend`

## Drupal

- QDR currently has `r drupal_users` users registered in Drupal.
- `r drupal_new_users` of those users were created between `r period_begin` and `r period_end`.
- `r drupal_unique_logins` unique users have logged in since `r period_begin`.
- There have been at least `r drupal_unique_logins` unique logins since `r period_begin`.

<!--Based on the time period used to generate the drupal login csv file:-->

<!--- If we subtract presumed QDR staff members (> `r QDR_staff_logins` logins/user), this number drops to `r drupal_logins_without_ff`. -->

```{r drupal}
# Top 25 institutional affiliations, ranked by number of registered users
login %>%
  count(Institution, name = "users") %>%
  arrange(desc(users)) %>%
  slice_head(n = 25) %>%
  knitr::kable(row.names = FALSE, col.names = c("Institution", "Affiliated users"),
               caption="Top institutional affiliations by number of users")

# Ten most viewed pages, with each title turned into a markdown link
use %>%
  select(!`Views today`) %>%
  arrange(desc(`Total views`)) %>%
  slice_head(n = 10) %>%
  mutate(Title = paste0("[", Title, "](",
                        "https://qdr.syr.edu", `Link to Content`, ")")) %>%
  select(-`Link to Content`) %>%
  knitr::kable(row.names = FALSE, col.names = c("Title", "Views"),
               caption="Most viewed QDR pages")
```

```{r members_drupal}
# Total number of accounts per institutional member
member_totals <- login %>%
  group_by(member, .drop = FALSE) %>%
  count()

# Add the number of users per member whose last login is after period_begin
member_recent <- login %>%
  group_by(member, .drop = FALSE) %>%
  summarize(recent = sum(recent)) %>%
  full_join(member_totals)

# Add the accounts created inside the reporting period, then rank members by
# their total number of users
im_drupal <- login %>%
  group_by(member, .drop = FALSE) %>%
  filter(Created > period_begin & Created < period_end) %>%
  summarize(created_since = n()) %>%
  full_join(member_recent) %>%
  arrange(desc(n)) %>%
  replace_na(list(created_since = 0))

# Export the member rows to an external CSV file
# This was requested as a version that is easier to parse than the table
im_drupal_csv <- paste0(Sys.getenv("EXTERNAL_GH_FOLDER"), "/output/im_drupal.csv")
im_drupal %>%
  filter(!is.na(member)) %>%
  replace_na(replace = list(created_since=0, recent=FALSE, n=0)) %>%
  write_csv(im_drupal_csv)

# Show the same data as a table in the PDF/HTML; missing cells print as "N/A"
opts <- options(knitr.kable.NA = "N/A")
im_drupal_headers <- c("Institution",
                       paste("Created between", period_begin, "and", period_end),
                       paste("Min unique logins since", period_begin),
                       "Total users")
knitr::kable(im_drupal, col.names = im_drupal_headers,
             caption="QDR Accounts and Logins by Institutional Members")
```

## Dataverse

```{r total_projects}
# Number of deposits returned by the dataverse search
dataverse_total <- nrow(deposits)

# Deposits whose creation date lies strictly inside the reporting period
deposits_new <- deposits %>%
  filter(as.Date(createdAt) > period_begin, as.Date(createdAt) < period_end) %>%
  nrow()

# Deposits whose publication date lies strictly inside the reporting period
published_new <- deposits %>%
  filter(as.Date(published_at) > period_begin, as.Date(published_at) < period_end) %>%
  nrow()
```

- There are currently `r dataverse_total` projects.
- `r deposits_new` new deposits were initiated between `r period_begin` and `r period_end`.
- `r published_new` projects were published between `r period_begin` and `r period_end`.

```{r subject}
# Missing counts in the tables below are printed as 0
opts <- options(knitr.kable.NA = 0)

deposits %>% group_by(versionState) %>% summarize(n=n()) %>%
  knitr::kable(caption="Total deposits by publication status", col.names=c("Status", "Deposits"))

# TRUE for dates strictly inside the reporting period. NA dates stay NA and
# are therefore dropped by filter().
in_period <- function(dates) {
  dates <- as.Date(dates)
  dates > period_begin & dates < period_end
}

# Count the rows of `rows` per subject into a column named `column` and merge
# that column into `totals`. When `rows` is empty the column is added as all
# zeros instead, so the final table always has every column.
add_subject_count <- function(totals, rows, column) {
  if (nrow(rows) == 0) {
    totals[[column]] <- 0
    return(totals)
  }
  rows %>%
    select(c("subjects")) %>%
    unnest_longer(subjects) %>%
    count(subjects, name = column) %>%
    full_join(totals, by = "subjects")
}

published <- deposits %>% filter(versionState == "Published")
unpublished <- deposits %>% filter(versionState == "Unpublished")

# Summarize published projects by subject
deposits_subj <- published %>%
  select(c("subjects")) %>%
  unnest_longer(subjects) %>%
  count(subjects, name = "published")

# Add the within-period and unpublished counts.
# The "period" column is joined only when at least one project was published
# in the period, and the check uses published_at, the same date the count
# itself is based on.
deposits_subj <- deposits_subj %>%
  add_subject_count(filter(published, in_period(published_at)), "period") %>%
  add_subject_count(unpublished, "unpublished") %>%
  add_subject_count(filter(unpublished, in_period(createdAt)), "unpublished_period")

# make the table on publications by subject
# published and unpublished, total and within-period
deposits_subj[, c("subjects", "published",
                  "unpublished", "period", "unpublished_period")] %>%
  knitr::kable(caption="Deposits by subject",
               col.names=c("Subject", "Total published", "Total unpublished",
                           "Published in reporting period",
                           "Unpublished in reporting period"))
```

```{r top}
# Downloaded projects

# Build one markdown link per dataset id, labelled with the deposit name.
# match() looks each pid up individually, so labels stay aligned with the
# download ranking. A pid that is not present in `deposits` is labelled with
# the pid itself.
project_link <- function(pid) {
  label <- as.character(deposits$name)[match(pid, deposits$global_id)]
  unnamed <- is.na(label)
  label[unnamed] <- pid[unnamed]
  paste0("[", label, "](",
         "https://data.qdr.syr.edu/dataset.xhtml?persistentId=",
         pid, ")")
}

# Sum the monthly download counts per project and keep the five largest
top_downloads <- function(monthly) {
  monthly %>%
    group_by(pid) %>%
    summarize(downloads = sum(count)) %>%
    arrange(desc(downloads)) %>%
    slice_head(n = 5) %>%
    mutate(pid = project_link(pid))
}

# All time downloaded projects, with clickable links
projects %>%
  top_downloads() %>%
  knitr::kable(caption="Top 5 all time downloaded projects",
             row.names = FALSE, col.names = c("Project", "Downloads"))

# same as above, but only for the specified period
projects %>%
  filter(date < period_end & date > period_begin) %>%
  top_downloads() %>%
  knitr::kable(caption=paste("Top 5 downloaded projects between ", period_begin,
                             "and", period_end),
             row.names = FALSE, col.names = c("Project", "Downloads"))
```


### Institutional members

```{r member_projects}
# Deposits attributed to an institutional member
member_deposits <- deposits %>% filter(!is.na(member))

# All-time number of deposits per member and publication status
im_deposit <- member_deposits %>%
  group_by(member, versionState, .drop = FALSE) %>%
  tally()

# Same count restricted to deposits created inside the reporting period,
# joined with the all-time counts and spread to one row per member
im_deposit <- member_deposits %>%
  filter(as.Date(createdAt) > period_begin, as.Date(createdAt) < period_end) %>%
  group_by(member, versionState, .drop = FALSE) %>%
  summarize(period = n()) %>%
  full_join(im_deposit) %>%
  pivot_wider(names_from = versionState, names_sep = ".",
              values_from = c(n, period), values_fill = 0) %>%
  select(n.Published, n.Unpublished, period.Published, period.Unpublished)

# Table for the PDF/HTML report
im_deposit_headers <- c("Institutional member name",
                        "Total published", "Total unpublished",
                        "Published in reporting period", "Unpublished in reporting period")
knitr::kable(im_deposit, caption="Deposits associated with institutional members",
             col.names = im_deposit_headers)

# The same data is also written to an external CSV file, which is easier to
# parse than the table included in the PDF/HTML
im_deposit_csv <- paste0(Sys.getenv("EXTERNAL_GH_FOLDER"), "/output/im_deposits.csv")
im_deposit %>%
  replace_na(list(n.Published=0, n.Unpublished=0, period.Published=0, period.Unpublished=0)) %>%
  write_csv(im_deposit_csv)

# Total downloads per deposit, keyed like the deposits table
downloads_by_deposit <- projects %>%
  group_by(global_id = pid) %>%
  summarize(downloads = sum(count))

# Three most downloaded deposits of each member, with clickable links
downloads_by_deposit %>%
  inner_join(deposits) %>%
  select(name, global_id, member, downloads) %>%
  filter(!is.na(member)) %>%
  arrange(desc(downloads)) %>%
  group_by(member, .drop = FALSE) %>%
  slice_head(n = 3) %>%
  mutate(name = paste0("[", name, "](",
                       "https://data.qdr.syr.edu/dataset.xhtml?persistentId=",
                       global_id, ")")) %>%
  select(-global_id) %>%
  knitr::kable(caption="Top 3 downloads from each member institution",
               col.names = c("Deposit", "Member institution", "Downloads"),
               linesep = "\\addlinespace")
```

## Make Data Count (MDC)

### Understanding MDC data

MDC reports statistics as "total" and "unique".

- "Unique" views/downloads correspond to the number of unique sessions in which a data project has been viewed/downloaded. Roughly translates to: "X people have viewed/downloaded this project or its files."
- Total views/downloads correspond to the number of times a project has been viewed/downloaded, including, e.g., multiple counts for multiple views/downloads in the same session or by the same user.


```{r mdc}
# QDR publishes MDC data, but not every dataverse instance does; when no `mdc`
# object exists the regular downloads stats are used instead
stats_source <- if (exists("mdc")) mdc else downloads

# Monthly series over every column except the first; the series start is the
# first date value split on "-"
stats_start <- strsplit(stats_source$date[1], split="-")[[1]]
stats_ts <- ts(stats_source[, -1], frequency = 12, start = stats_start)

# plot the available stats
autoplot(stats_ts, facets = FALSE) +
  labs(title="Statistics", color="Statistic") +
  scale_y_continuous(labels = scales::comma)

```