-
Notifications
You must be signed in to change notification settings - Fork 5
Expand file tree
/
Copy pathtesting_datacite_reporting.sh
More file actions
135 lines (111 loc) · 4.63 KB
/
testing_datacite_reporting.sh
File metadata and controls
135 lines (111 loc) · 4.63 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
#!/bin/bash
# Datacite Reporting Script
# This script checks Datacite reports for a specific date and analyzes dataset usage statistics
# It handles report retrieval, data analysis, and notification of empty instance arrays
# Logging configuration: log() appends every message to this file. The path
# is relative, so the file is created in the current working directory.
LOGFILE='datacite_reporting.log'
# Print a timestamped message to stdout and append the same line to $LOGFILE.
# Globals:   LOGFILE (read) - file the line is appended to
# Arguments: $1 - message text
# Outputs:   "YYYY-MM-DD HH:MM:SS - <message>" on stdout and in $LOGFILE
log() {
  local stamp
  stamp=$(date +"%Y-%m-%d %H:%M:%S")
  printf '%s - %s\n' "$stamp" "$1" | tee -a "$LOGFILE"
}
# Abort the script when the command that ran immediately before this call
# failed.
# Arguments: $1 - description of the failure, embedded in the logged message
# Returns:   0 when the previous command succeeded; otherwise logs the error
#            through log() and exits the script with status 1
check_error() {
  # Must be the first statement: any other command would overwrite $?.
  local status=$?
  [ "$status" -eq 0 ] && return 0
  log "ERROR: $1. Exiting."
  exit 1
}
# Verify that every external command this script relies on is installed.
# Globals:   LOGFILE (read) - the list of missing commands is appended to it
# Arguments: none
# Outputs:   when something is missing, the missing commands and install hints
#            on stdout and in the log
# Returns:   0 when every command is found; exits the script with status 1
#            otherwise
check_required_commands() {
  # Declared local so the loop does not overwrite a caller's global "cmd".
  local cmd
  local missing_commands=()
  local required_commands=(
    "curl" "jq" "mail" "date" "wc"
  )
  for cmd in "${required_commands[@]}"; do
    if ! command -v "$cmd" >/dev/null 2>&1; then
      missing_commands+=("$cmd")
    fi
  done
  if (( ${#missing_commands[@]} == 0 )); then
    return 0
  fi
  log "Error: The following required commands are not installed:"
  printf ' - %s\n' "${missing_commands[@]}" | tee -a "$LOGFILE"
  echo
  log "Please install these commands before running the script."
  log "On Debian/Ubuntu systems, you can install them with:"
  log "sudo apt-get install curl jq mailutils"
  log "On RHEL/CentOS systems, you can install them with:"
  log "sudo yum install curl jq mailx"
  exit 1
}
# Fail fast when a required tool is missing, before any configuration is read
# or any network request is made.
check_required_commands
# Load environment variables from the .env file that sits next to this script.
script_dir=$(dirname "$0")
if [ -f "$script_dir/.env" ]; then
  log "Loading environment variables from .env file..."
  # shellcheck source=/dev/null
  source "$script_dir/.env"
else
  log "Error: .env file not found in $script_dir"
  log "Please copy sample.env to .env and update the values."
  exit 1
fi
# Validate required environment variables: each one must be set and non-empty
# once .env has been loaded, otherwise the API query and the notification
# email below cannot work.
required_vars=(
  "DATACITE_ORG"
  "EMAIL_RECIPIENT"
)
missing_vars=()
for required_var in "${required_vars[@]}"; do
  # ${!name} is bash indirect expansion: the value of the variable whose name
  # is stored in $required_var.
  if [ -z "${!required_var:-}" ]; then
    missing_vars+=("$required_var")
  fi
done
if [ "${#missing_vars[@]}" -ne 0 ]; then
  log "Error: missing required environment variables: ${missing_vars[*]}"
  log "Please set them in $(dirname "$0")/.env"
  exit 1
fi
# Set YESTERDAY to the desired date (default: yesterday), as YYYY-MM-DD.
# BSD/macOS date takes "-v-1d"; GNU date (Linux) rejects it and needs
# "-d yesterday", so try the BSD form first and fall back to the GNU form.
if ! YESTERDAY=$(date -v-1d +%Y-%m-%d 2>/dev/null); then
  YESTERDAY=$(date -d "yesterday" +%Y-%m-%d 2>/dev/null)
fi
if [ -z "$YESTERDAY" ]; then
  echo "Error: could not compute yesterday's date with this system's date command." >&2
  exit 1
fi
echo "Checking reports for date: $YESTERDAY"
# Get all reports, find the one whose end-date is $YESTERDAY, extract the report ID.
# -f makes curl fail on HTTP errors and -S keeps the error message visible
# despite -s, so a failed request is not mistaken for "no report found".
if ! REPORTS_JSON=$(curl -sS -f "https://api.datacite.org/reports?created_by=$DATACITE_ORG"); then
  echo "Error: could not retrieve the report list from the DataCite API." >&2
  exit 1
fi
# Check if we got any reports at all. jq -e maps a false/null result (and a
# parse error) to a non-zero exit status.
if ! jq -e '.reports != null' >/dev/null 2>&1 <<<"$REPORTS_JSON"; then
  echo "No reports found in the response."
  exit 0
fi
# Collect the matching IDs into an array and keep only the first one: several
# matches would otherwise give a multi-line value and break the report URL.
# "// empty" prints nothing (instead of "null") when nothing matches.
REPORT_ID=$(jq -r --arg YESTERDAY "$YESTERDAY" '
  [ .reports[]
    | select(.["report-header"]["reporting-period"]["end-date"] == $YESTERDAY)
    | .id ][0] // empty' <<<"$REPORTS_JSON")
if [ -z "$REPORT_ID" ]; then
  echo "No report found for end-date $YESTERDAY."
  exit 0
fi
echo "Found report ID: $REPORT_ID"
# Use the ID to fetch the specific report. -f/-S surface HTTP and transport
# errors instead of continuing with an empty body.
if ! REPORT_JSON=$(curl -sS -f "https://api.datacite.org/reports/$REPORT_ID"); then
  echo "Error: could not retrieve report $REPORT_ID from the DataCite API." >&2
  exit 1
fi
# Check if report-datasets exists and is not null.
# The test asks jq for a boolean and reads its exit status (-e): iterating
# the datasets and comparing the output with the string "null" can never
# match, because jq either prints objects or aborts on a null value.
if ! jq -e '.report["report-datasets"] != null' >/dev/null 2>&1 <<<"$REPORT_JSON"; then
  echo "Error:Report exists but contains no datasets."
  exit 0
fi
# Get total number of datasets: collect the count of every
# "unique-dataset-investigations" instance in the report, one number per line.
# The "// []" fallbacks let a missing or null level yield nothing instead of
# aborting jq.
LIST_DATASETS=$(jq '
  (.report["report-datasets"] // [])[]
  | (.performance // [])[]
  | (.instance // [])[]
  | select(.["metric-type"] == "unique-dataset-investigations")
  | .count' <<<"$REPORT_JSON")
# Slurp (-s) the numbers into one array. "add // 0" gives 0 rather than null
# for an empty list, and "length" is 0 for an empty list, whereas
# `echo "" | wc -l` reports 1 and made the zero check below unreachable.
TOTAL_DATASET_VIEWS=$(jq -s 'add // 0' <<<"$LIST_DATASETS")
TOTAL_DATASETS=$(jq -s 'length' <<<"$LIST_DATASETS")
# Fall back to 0 if jq produced no output, so the numeric test stays valid.
TOTAL_DATASET_VIEWS=${TOTAL_DATASET_VIEWS:-0}
TOTAL_DATASETS=${TOTAL_DATASETS:-0}
echo "Total number of datasets in report: $TOTAL_DATASETS"
echo "Total number of datasets views in report: $TOTAL_DATASET_VIEWS"
if [ "$TOTAL_DATASETS" -eq 0 ]; then
  echo "Report exists but contains no datasets."
  exit 0
fi
# Check datasets with their instance arrays.
# Both numbers count performance entries and are computed inside jq; piping
# pretty-printed JSON to `wc -l` counted output lines, not entries. A missing
# or null "instance" is treated as empty.
DATASETS_WITH_INSTANCES=$(jq '
  [ (.report["report-datasets"] // [])[]
    | (.performance // [])[]
    | select((.instance // []) | length > 0) ] | length' <<<"$REPORT_JSON")
DATASETS_WITHOUT_INSTANCES=$(jq '
  [ (.report["report-datasets"] // [])[]
    | (.performance // [])[]
    | select((.instance // []) | length == 0) ] | length' <<<"$REPORT_JSON")
# Fall back to 0 if jq produced no output, so the numeric tests stay valid.
DATASETS_WITH_INSTANCES=${DATASETS_WITH_INSTANCES:-0}
DATASETS_WITHOUT_INSTANCES=${DATASETS_WITHOUT_INSTANCES:-0}
echo "Datasets with non-empty instance arrays: $DATASETS_WITH_INSTANCES"
echo "Datasets with empty instance arrays: $DATASETS_WITHOUT_INSTANCES"
# Only show detailed empty instance data if there are any
if [ "$DATASETS_WITHOUT_INSTANCES" -gt 0 ]; then
  printf '\n%s\n' "Detailed list of datasets with empty instance arrays:"
  # Keep only the datasets that have at least one performance entry with an
  # empty instance array, so the list matches its heading.
  jq '
    (.report["report-datasets"] // [])[]
    | select(any((.performance // [])[]; (.instance // []) | length == 0))
    | {
        "dataset-title": .["dataset-title"],
        "dataset-id": .["dataset-id"],
        "uri": .uri,
        "performance": [(.performance // [])[] | {period, instance}]
      }' <<<"$REPORT_JSON"
fi
# If Total number of datasets in report isn't 0 and there are datasets with
# empty instance arrays, send an email.
if [ "${TOTAL_DATASETS:-0}" -ne 0 ] && [ "${DATASETS_WITHOUT_INSTANCES:-0}" -gt 0 ]; then
  echo "Sending email to $EMAIL_RECIPIENT"
  # Split on whitespace on purpose, so several space-separated addresses keep
  # working, while avoiding the glob expansion an unquoted variable allows.
  read -r -a email_recipients <<<"$EMAIL_RECIPIENT"
  if [ "${#email_recipients[@]}" -eq 0 ]; then
    echo "Error: EMAIL_RECIPIENT is empty; cannot send the notification." >&2
    exit 1
  fi
  # The body states why the mail was sent; the subject is passed with -s.
  if ! printf '%s\n' \
    "The DataCite report for $YESTERDAY (report ID: $REPORT_ID) contains $DATASETS_WITHOUT_INSTANCES performance entries with an empty instance array." \
    | mail -s "Dataverse Report for $YESTERDAY" "${email_recipients[@]}"; then
    echo "Error: failed to send email to $EMAIL_RECIPIENT" >&2
    exit 1
  fi
fi