Skip to content

Commit 122736c

Browse files
authored
Merge pull request #11486 from IQSS/11473-harvesting-client-ratelimit
Harvesting client improvements: configurable delay between GetRecord calls; a fix for a problem with long-running DataCite harvests
2 parents fe0c62f + 6b7284d commit 122736c

7 files changed

Lines changed: 209 additions & 4 deletions

File tree

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
A setting has been added for configuring sleep intervals between OAI calls for specific harvesting clients, making it possible to harvest uninterrupted from servers that enforce rate limit policies. See the configuration guide for details. Additionally, this release fixes a problem with harvesting from DataCite OAI-PMH where initial, long-running harvests were failing on sets with large numbers of records.
2+
3+
## New Database Settings
4+
5+
- :HarvestingClientCallRateLimit

doc/sphinx-guides/source/installation/config.rst

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4672,6 +4672,21 @@ Examples:
46724672

46734673
``curl -X PUT -d '{"default":"0", "CSV":"268435456"}' http://localhost:8080/api/admin/settings/:TabularIngestSizeLimit``
46744674

4675+
.. _:HarvestingClientCallRateLimit:
4676+
4677+
:HarvestingClientCallRateLimit
4678+
++++++++++++++++++++++++++++++
4679+
4680+
This setting allows configuring sleep intervals between OAI calls for specific harvesting clients, which makes it possible to harvest from servers that enforce rate limits.
4681+
4682+
The setting value is a serialized JSON object mapping client names to the specified intervals in fractional seconds. It is also possible to set a universal default interval for all harvesting clients on the instance (in a somewhat unlikely use case where this may be practically necessary).
4683+
4684+
In the following example, the harvester is instructed to sleep for 900 milliseconds between calls when running the client named ``harvarddv``, and to default to zero otherwise:
4685+
4686+
``curl -X PUT -d "{\"harvarddv\": 0.9, \"default\": 0}" "http://localhost:8080/api/admin/settings/:HarvestingClientCallRateLimit"``
4687+
4688+
Please note that the default in the example above is there for illustrative purposes and is otherwise redundant, since no sleep interval is the default behavior anyway.
4689+
46754690
.. _:ZipUploadFilesLimit:
46764691

46774692
:ZipUploadFilesLimit

src/main/java/edu/harvard/iq/dataverse/harvest/client/HarvesterServiceBean.java

Lines changed: 48 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@
4545
import edu.harvard.iq.dataverse.harvest.client.oai.OaiHandler;
4646
import edu.harvard.iq.dataverse.harvest.client.oai.OaiHandlerException;
4747
import edu.harvard.iq.dataverse.search.IndexServiceBean;
48+
import edu.harvard.iq.dataverse.util.SystemConfig;
4849
import java.io.FileOutputStream;
4950
import java.io.FileWriter;
5051
import java.io.InputStream;
@@ -85,6 +86,8 @@ public class HarvesterServiceBean {
8586
EjbDataverseEngine engineService;
8687
@EJB
8788
IndexServiceBean indexService;
89+
@EJB
90+
SystemConfig systemConfig;
8891

8992
private static final Logger logger = Logger.getLogger("edu.harvard.iq.dataverse.harvest.client.HarvesterServiceBean");
9093
private static final SimpleDateFormat logFormatter = new SimpleDateFormat("yyyy-MM-dd'T'HH-mm-ss");
@@ -270,6 +273,8 @@ private void harvestOAI(DataverseRequest dataverseRequest, HarvestingClient harv
270273
}
271274

272275
private void harvestOAIviaListIdentifiers(OaiHandler oaiHandler, DataverseRequest dataverseRequest, HarvestingClient harvestingClient, HttpClient httpClient, List<String> failedIdentifiers, List<String> deletedIdentifiers, List<Long> harvestedDatasetIds, Logger harvesterLogger, PrintWriter importCleanupLog) throws OaiHandlerException, StopHarvestException {
276+
int sleepInterval = lookupSleepInterval(harvestingClient.getName());
277+
273278
for (Iterator<Header> idIter = oaiHandler.runListIdentifiers(); idIter.hasNext();) {
274279
// Before each iteration, check if this harvesting job needs to be aborted:
275280
if (checkIfStoppingJob(harvestingClient)) {
@@ -291,6 +296,8 @@ private void harvestOAIviaListIdentifiers(OaiHandler oaiHandler, DataverseReques
291296

292297
MutableBoolean getRecordErrorOccurred = new MutableBoolean(false);
293298

299+
sleepIfNeeded(sleepInterval);
300+
294301
// Retrieve and process this record with a separate GetRecord call:
295302
Long datasetId = processRecord(dataverseRequest, harvesterLogger, importCleanupLog, oaiHandler, identifier, getRecordErrorOccurred, deletedIdentifiers, dateStamp, httpClient);
296303

@@ -307,6 +314,22 @@ private void harvestOAIviaListIdentifiers(OaiHandler oaiHandler, DataverseReques
307314
}
308315

309316
private void harvestOAIviaListRecords(OaiHandler oaiHandler, DataverseRequest dataverseRequest, HarvestingClient harvestingClient, HttpClient httpClient, List<String> failedIdentifiers, List<String> deletedIdentifiers, List<Long> harvestedDatasetIds, Logger harvesterLogger, PrintWriter importCleanupLog) throws OaiHandlerException, StopHarvestException {
317+
/*
318+
* It is *exceptionally* unlikely that anyone will ever run into issues
319+
* with server rate limits when harvesting using the ListRecords method.
320+
* Since only one call needs to be made in order to import multiple
321+
* datasets. The number of records served is nominally arbitrary and
322+
* varies from server to server. However, most known OAI servers will
323+
* serve 50 to 100 records at a time. If a server has a rate limit policy
324+
* of 300 calls/5 min. and their ListRecords serves 50 records per call,
325+
* Dataverse will need to import 50 datasets per second in order to run
326+
* afoul of the limit. Even with an empty database, Dataverse generally
327+
* doesn't work that fast.
328+
* But, it doesn't hurt to make it possible to define the interval
329+
* regardless, in case it is called for in some exotic scenario.
330+
**/
331+
int sleepInterval = lookupSleepInterval(harvestingClient.getName());
332+
310333
for (Iterator<Record> idIter = oaiHandler.runListRecords(); idIter.hasNext();) {
311334
// Before each iteration, check if this harvesting job needs to be aborted:
312335
if (checkIfStoppingJob(harvestingClient)) {
@@ -375,9 +398,33 @@ private void harvestOAIviaListRecords(OaiHandler oaiHandler, DataverseRequest da
375398
//can be uncommented out for testing failure handling:
376399
//throw new IOException("Exception occured, stopping harvest");
377400
}
401+
402+
sleepIfNeeded(sleepInterval);
378403
}
379404
}
380-
405+
406+
private int lookupSleepInterval(String clientName) {
407+
int sleepMilliseconds = 0;
408+
float clientIntervalValue = systemConfig.getHarvestingClientRequestInterval(clientName);
409+
410+
sleepMilliseconds = (int) (clientIntervalValue * 1000);
411+
logger.info("Sleep interval in milliseconds: " + sleepMilliseconds);
412+
413+
return sleepMilliseconds;
414+
}
415+
416+
private void sleepIfNeeded(int sleepInterval) {
417+
if (sleepInterval > 0) {
418+
logger.fine("Sleeping for " + sleepInterval + " milliseconds...");
419+
try {
420+
Thread.sleep(sleepInterval);
421+
} catch (InterruptedException iex) {
422+
logger.warning("InterruptedException trying to sleep for " + sleepInterval + " milliseconds");
423+
}
424+
}
425+
426+
}
427+
381428
private Long processRecord(DataverseRequest dataverseRequest, Logger hdLogger, PrintWriter importCleanupLog, OaiHandler oaiHandler, String identifier, MutableBoolean recordErrorOccurred, List<String> deletedIdentifiers, Date dateStamp, HttpClient httpClient) {
382429
String errMessage = null;
383430
Dataset harvestedDataset = null;

src/main/java/edu/harvard/iq/dataverse/harvest/client/oai/OaiHandler.java

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
import java.io.IOException;
2222
import java.io.Serializable;
2323
import java.net.http.HttpClient;
24+
import java.time.Duration;
2425
import javax.xml.parsers.ParserConfigurationException;
2526

2627
import org.apache.commons.lang3.StringUtils;
@@ -147,14 +148,21 @@ public ServiceProvider getServiceProvider() throws OaiHandlerException {
147148

148149
context.withBaseUrl(baseOaiUrl);
149150
context.withGranularity(Granularity.Second);
150-
151-
JdkHttpOaiClient.Builder xoaiClientBuilder = JdkHttpOaiClient.newBuilder().withBaseUrl(getBaseOaiUrl());
151+
152+
// Note that we are defaulting to HTTP/1.1 (JDK HttpClient defaults to
153+
// HTTP/2 otherwise). By nature of OAI-PMH, HTTP/2 offers no practical
154+
// benefit. However, long-running harvests from servers supporting
155+
// HTTP/2 can fail due to a bug in JDK 17 (HttpClient does not
156+
// properly handle GoAway stream responses apparently). For example,
157+
// harvests from DataCite OAI-PMH were failing at 1 hour mark.
158+
JdkHttpOaiClient.Builder xoaiClientBuilder = (new JdkHttpOaiClient.JdkHttpBuilder(HttpClient.newBuilder().version(HttpClient.Version.HTTP_1_1))).withBaseUrl(getBaseOaiUrl());
152159
if (getCustomHeaders() != null) {
153160
for (String headerName : getCustomHeaders().keySet()) {
154161
logger.fine("adding custom header; name: "+headerName+", value: "+getCustomHeaders().get(headerName));
155162
}
156163
xoaiClientBuilder = xoaiClientBuilder.withCustomHeaders(getCustomHeaders());
157164
}
165+
xoaiClientBuilder = xoaiClientBuilder.withConnectTimeout(Duration.ofSeconds(180));
158166
context.withOAIClient(xoaiClientBuilder.build());
159167
context.withSaveUnparsedMetadata();
160168
serviceProvider = new ServiceProvider(context);

src/main/java/edu/harvard/iq/dataverse/settings/SettingsServiceBean.java

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -765,7 +765,7 @@ Whether Harvesting (OAI) service is enabled
765765
FileCategories,
766766
CreateDataFilesMaxErrorsToDisplay,
767767

768-
ContactFeedbackMessageSizeLimit,
768+
ContactFeedbackMessageSizeLimit,
769769
//Experimental setting to allow connecting to a GET external search service expecting a GET request with query parameter mirroring the search API query parameters (without search_service)
770770
GetExternalSearchUrl,
771771
//Experimental setting to provide a display name for the GET external search service
@@ -779,6 +779,8 @@ Whether Harvesting (OAI) service is enabled
779779
COARNotifyRelationshipAnnouncementTriggerFields,
780780
// JSON specification of the targets to send announcements to
781781
COARNotifyRelationshipAnnouncementTargets,
782+
// Configurable delay between harvesting calls, when required to avoid triggering rate limits
783+
HarvestingClientCallRateLimit
782784
;
783785

784786
@Override

src/main/java/edu/harvard/iq/dataverse/util/SystemConfig.java

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,8 @@ public class SystemConfig {
6565
* token is valid ({@link #getMinutesUntilPasswordResetTokenExpires}).
6666
*/
6767
private static final String PASSWORD_RESET_TIMEOUT_IN_MINUTES = "dataverse.auth.password-reset-timeout-in-minutes";
68+
69+
public static final String DEFAULT_KEY = "default";
6870

6971
/**
7072
* The default number of datafiles that we allow to be created through
@@ -601,6 +603,7 @@ public long getTabularIngestSizeLimit() {
601603
* or the default size limit if no format-specific limit is found or its name is invalid (null, blank, ...).
602604
* -1 = unlimited if not set, 0 if disabled or invalid, some long number of bytes otherwise
603605
*/
606+
604607
public long getTabularIngestSizeLimit(String formatName) {
605608
if (formatName != null && !formatName.isBlank()) {
606609
// We convert to lowercase so it doesn't matter which variant someone uses in the JSON config
@@ -610,6 +613,82 @@ public long getTabularIngestSizeLimit(String formatName) {
610613
return getTabularIngestSizeLimit();
611614
}
612615

616+
public Map<String, Float> getHarvestingClientRequestIntervals() {
617+
String settingString = settingsService.getValueForKey(SettingsServiceBean.Key.HarvestingClientCallRateLimit);
618+
if (settingString != null) {
619+
// Case A: the setting is using JSON to support multiple clients
620+
if (settingString.trim().startsWith("{")) {
621+
try (JsonReader reader = Json.createReader(new StringReader(settingString))) {
622+
JsonObject delays = reader.readObject();
623+
624+
Map<String, Float> limitsMap = new HashMap<>();
625+
// We add the default in case the JSON does not contain the default (which is optional).
626+
limitsMap.put(DEFAULT_KEY, 0F);
627+
628+
for (Map.Entry<String, JsonValue> clientEntry : delays.entrySet()) {
629+
String clientName = clientEntry.getKey();
630+
String lowercaseClientName = clientName.toLowerCase();
631+
632+
try {
633+
JsonValue value = clientEntry.getValue();
634+
float delayInterval;
635+
636+
// We want to be able to use either numbers or string values, so detect which one it is.
637+
// This is necessary as we need to tell the JSON parser what to do, it doesn't automatically handle this for us.
638+
if (value.getValueType() == JsonValue.ValueType.STRING) {
639+
delayInterval = Float.parseFloat(delays.getString(clientName));
640+
} else if (value.getValueType() == JsonValue.ValueType.NUMBER) {
641+
// Will throw if not a valid float number!
642+
delayInterval = delays.getJsonNumber(clientName).numberValue().floatValue(); //.doubleValue();
643+
} else {
644+
logger.warning(() -> "Invalid value type for client " + clientName + ": expected string or number");
645+
logger.warning("Disabling all harvesting client delay intervals completely until fixed!");
646+
return Map.of(DEFAULT_KEY, 0F);
647+
}
648+
649+
limitsMap.put(lowercaseClientName, delayInterval);
650+
} catch (NumberFormatException nfe) {
651+
logger.warning(() -> "Could not convert " + SettingsServiceBean.Key.HarvestingClientCallRateLimit + " entry to float for client " + clientName + " (not a valid number)");
652+
logger.warning("Disabling all harvesting client delay intervals completely until fixed!");
653+
return Map.of(DEFAULT_KEY, 0F);
654+
} catch (ArithmeticException ae) {
655+
logger.warning(() -> "Number too large, or otherwise invalid for client " + clientName);
656+
logger.warning("Disabling all harvesting client delay intervals completely until fixed!");
657+
return Map.of(DEFAULT_KEY, 0F);
658+
}
659+
}
660+
661+
return Collections.unmodifiableMap(limitsMap);
662+
} catch (JsonParsingException e) {
663+
logger.warning(() -> "Invalid " + SettingsServiceBean.Key.HarvestingClientCallRateLimit + " option found, cannot parse JSON: " + e.getMessage());
664+
logger.warning("Disabling all harvesting client delay intervals completely until fixed!");
665+
return Map.of(DEFAULT_KEY, 0F);
666+
}
667+
// Case B: It might be just a simple float, providing a default for all clients.
668+
} else {
669+
try {
670+
float delayInterval = Float.valueOf(settingString);
671+
return Map.of(DEFAULT_KEY, delayInterval);
672+
} catch (NumberFormatException nfe) {
673+
logger.warning(() -> "Could not convert " + SettingsServiceBean.Key.HarvestingClientCallRateLimit + " to float: " + nfe.getMessage());
674+
logger.warning("Disabling all harvesting client delay intervals completely until fixed!");
675+
return Map.of(DEFAULT_KEY, 0F);
676+
}
677+
}
678+
}
679+
// Default is not to limit at all
680+
return Map.of(DEFAULT_KEY, 0F);
681+
}
682+
683+
public float getHarvestingClientRequestInterval(String clientName) {
684+
if (clientName != null && !clientName.isBlank()) {
685+
// We convert to lowercase so it doesn't matter which variant someone uses in the JSON config
686+
String convertedClientName = clientName.toLowerCase();
687+
return getHarvestingClientRequestIntervals().getOrDefault(convertedClientName, getHarvestingClientRequestIntervals().get(DEFAULT_KEY));
688+
}
689+
return getHarvestingClientRequestIntervals().get(DEFAULT_KEY);
690+
}
691+
613692
public boolean isOAIServerEnabled() {
614693
boolean defaultResponse = false;
615694
return settingsService.isTrueForKey(SettingsServiceBean.Key.OAIServerEnabled, defaultResponse);

src/test/java/edu/harvard/iq/dataverse/util/SystemConfigTest.java

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -202,6 +202,55 @@ void testGetTabularIngestSizeLimitsWithSingleInvalidValue() {
202202
assertEquals(1, result.size());
203203
assertEquals(0L, (long) result.get(SystemConfig.TABULAR_INGEST_SIZE_LIMITS_DEFAULT_KEY));
204204
}
205+
206+
@Test
207+
public void testGetHarvestingClientRequestIntervals() {
208+
209+
// Test with setting not set will return default 0.0
210+
// given
211+
doReturn(null).when(settingsService).getValueForKey(SettingsServiceBean.Key.HarvestingClientCallRateLimit);
212+
// when
213+
Map<String, Float> result = systemConfig.getHarvestingClientRequestIntervals();
214+
// then
215+
assertEquals(1, result.size());
216+
assertEquals(0, result.get(SystemConfig.DEFAULT_KEY));
217+
218+
// Test with good client
219+
String value = "{\"harvarddv\": 0.9, \"default\": 0.0}";
220+
// given
221+
doReturn(value).when(settingsService).getValueForKey(SettingsServiceBean.Key.HarvestingClientCallRateLimit);
222+
// when
223+
result = systemConfig.getHarvestingClientRequestIntervals();
224+
// then
225+
assertEquals(2, result.size());
226+
assertTrue(result.containsKey("harvarddv"));
227+
assertTrue(result.containsKey("default"));
228+
assertEquals(0.9F, systemConfig.getHarvestingClientRequestInterval("harvarddv"));
229+
assertEquals(0.0F, systemConfig.getHarvestingClientRequestInterval("notFoundSoDefault"));
230+
231+
// Test with missing default will create default 0.0
232+
value = "{\"harvarddv\": 0.9}";
233+
// given
234+
doReturn(value).when(settingsService).getValueForKey(SettingsServiceBean.Key.HarvestingClientCallRateLimit);
235+
// when
236+
result = systemConfig.getHarvestingClientRequestIntervals();
237+
// then
238+
assertEquals(2, result.size());
239+
assertTrue(result.containsKey("default"));
240+
assertEquals(0.0F, systemConfig.getHarvestingClientRequestInterval("default"));
241+
242+
// Test with invalid JSON (value as string instead of float) will default setting to default 0.0
243+
value = "{\"harvarddv1\": 0.9, \"harvarddv2\": \"string\"}";
244+
// given
245+
doReturn(value).when(settingsService).getValueForKey(SettingsServiceBean.Key.HarvestingClientCallRateLimit);
246+
// when
247+
result = systemConfig.getHarvestingClientRequestIntervals();
248+
// then
249+
assertEquals(1, result.size());
250+
assertTrue(result.containsKey("default"));
251+
assertTrue(!result.containsKey("harvarddv1"));
252+
assertTrue(!result.containsKey("harvarddv2"));
253+
}
205254

206255
@ParameterizedTest
207256
@ValueSource(strings = {"", "{ invalid: }"})

0 commit comments

Comments
 (0)