Skip to content
Merged
1 change: 1 addition & 0 deletions doc/release-notes/xmlutil.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
The configuration of XML parsers used in Dataverse has been centralized and unused functionality has been turned off to enhance security.
2 changes: 1 addition & 1 deletion doc/sphinx-guides/source/api/changelog.rst
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ v6.7
- An undocumented :doc:`search` parameter called "show_my_data" has been removed. It was never exercised by tests and is believed to be unused. API users should use the :ref:`api-mydata` API instead.
- /api/datasets/{id}/curationStatus API now includes a JSON object with curation label, createtime, and assigner rather than a string 'label' and it supports a new boolean includeHistory parameter (default false) that returns a JSON array of statuses
- /api/datasets/{id}/listCurationStates includes new columns "Status Set Time" and "Status Set By", listing the time the current status was applied and by whom. It also supports the boolean includeHistory parameter.
- Due to updates in libraries used by Dataverse, XML serialization may have changed slightly with respect to whether self-closing tags are used for empty elements. This primiarily affects XML-based metadata exports. The XML structure of the export itself has not changed, so this is only an incompatibility if you are not using an XML parser.
- Due to updates in libraries used by Dataverse, XML serialization may have changed slightly with respect to whether self-closing tags are used for empty elements. This primarily affects XML-based metadata exports. The XML structure of the export itself has not changed, so this is only an incompatibility if you are not using an XML parser.

v6.6
----
Expand Down
12 changes: 9 additions & 3 deletions src/main/java/edu/harvard/iq/dataverse/api/EditDDI.java
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
import edu.harvard.iq.dataverse.datavariable.VariableCategory;
import edu.harvard.iq.dataverse.datavariable.VariableMetadataDDIParser;
import edu.harvard.iq.dataverse.search.IndexServiceBean;

import edu.harvard.iq.dataverse.util.xml.XmlUtil;
import jakarta.ejb.EJB;
import jakarta.ejb.EJBException;
import jakarta.ejb.Stateless;
Expand Down Expand Up @@ -355,13 +355,19 @@ private boolean updateDraftVersion(ArrayList<VariableMetadata> neededToUpdateVM,

/**
 * Parses variable-level DDI metadata from the given XML stream, populating
 * the supplied maps via {@link VariableMetadataDDIParser#processDataDscr}.
 *
 * @param body           input stream containing the DDI XML fragment
 * @param mapVarToVarMet populated with variable id -> variable metadata
 * @param varGroupMap    populated with group id -> variable group
 * @throws XMLStreamException   if the XML is malformed
 * @throws NullPointerException declared for caller compatibility
 */
private void readXML(InputStream body, Map<Long,VariableMetadata> mapVarToVarMet, Map<Long,VarGroup> varGroupMap) throws XMLStreamException, NullPointerException {

    // Hardened factory (DTDs/external entities disabled) to prevent XXE.
    XMLInputFactory factory = XmlUtil.getSecureXMLInputFactory();
    XMLStreamReader xmlr = factory.createXMLStreamReader(body);

    VariableMetadataDDIParser vmdp = new VariableMetadataDDIParser();

    try {
        vmdp.processDataDscr(xmlr, mapVarToVarMet, varGroupMap);
    } finally {
        // Close in finally so the reader is released even when parsing fails.
        try {
            xmlr.close();
        } catch (XMLStreamException e) {
            logger.warning("XMLStreamException closing XMLStreamReader in readXML");
        }
    }
}

private boolean newGroups(Map<Long,VarGroup> varGroupMap, FileMetadata fm) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@
import edu.harvard.iq.dataverse.license.License;
import edu.harvard.iq.dataverse.license.LicenseServiceBean;
import edu.harvard.iq.dataverse.util.StringUtil;
import edu.harvard.iq.dataverse.util.xml.XmlUtil;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
Expand Down Expand Up @@ -121,8 +123,7 @@ public class ImportDDIServiceBean {
// TODO: stop passing the xml source as a string; (it could be huge!) -- L.A. 4.5
// TODO: what L.A. Said.
public DatasetDTO doImport(ImportType importType, String xmlToParse) throws XMLStreamException, ImportException {
xmlInputFactory = javax.xml.stream.XMLInputFactory.newInstance();
xmlInputFactory.setProperty("javax.xml.stream.isCoalescing", java.lang.Boolean.TRUE); DatasetDTO datasetDTO = this.initializeDataset();
DatasetDTO datasetDTO = this.initializeDataset();

// Read docDescr and studyDesc into DTO objects.
// TODO: the fileMap is likely not needed.
Expand All @@ -147,11 +148,16 @@ public Map<String, String> mapDDI(ImportType importType, String xmlToParse, Data
Map<String, String> filesMap = new HashMap<>();
StringReader reader = new StringReader(xmlToParse);
XMLStreamReader xmlr = null;
XMLInputFactory xmlFactory = javax.xml.stream.XMLInputFactory.newInstance();
xmlFactory.setProperty("javax.xml.stream.isCoalescing", true); // allows the parsing of a CDATA segment into a single event
XMLInputFactory xmlFactory = XmlUtil.getSecureXMLInputFactory();
xmlr = xmlFactory.createXMLStreamReader(reader);
processDDI(importType, xmlr, datasetDTO, filesMap);

if (xmlr != null) {
try {
xmlr.close();
} catch (XMLStreamException e) {
logger.warning("XMLStreamException closing XMLStreamReader in mapDDI()");
}
}
return filesMap;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@
import edu.harvard.iq.dataverse.util.StringUtil;
import edu.harvard.iq.dataverse.util.json.JsonParseException;
import edu.harvard.iq.dataverse.util.json.JsonParser;
import edu.harvard.iq.dataverse.util.xml.XmlUtil;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
Expand Down Expand Up @@ -106,7 +108,7 @@ public void importXML(String xmlToParse, String foreignFormat, DatasetVersion da

try {
reader = new StringReader(xmlToParse);
XMLInputFactory xmlFactory = javax.xml.stream.XMLInputFactory.newInstance();
XMLInputFactory xmlFactory = XmlUtil.getSecureXMLInputFactory();
xmlr = xmlFactory.createXMLStreamReader(reader);
DatasetDTO datasetDTO = processXML(xmlr, mappingSupported);

Expand Down Expand Up @@ -173,7 +175,7 @@ public DatasetDTO processOAIDCxml(String DcXmlToParse, String oaiIdentifier, boo

try {
reader = new StringReader(DcXmlToParse);
XMLInputFactory xmlFactory = javax.xml.stream.XMLInputFactory.newInstance();
XMLInputFactory xmlFactory = XmlUtil.getSecureXMLInputFactory();
xmlr = xmlFactory.createXMLStreamReader(reader);

//while (xmlr.next() == XMLStreamConstants.COMMENT); // skip pre root comments
Expand All @@ -184,6 +186,13 @@ public DatasetDTO processOAIDCxml(String DcXmlToParse, String oaiIdentifier, boo
processXMLElement(xmlr, ":", OAI_DC_OPENING_TAG, dublinCoreMapping, datasetDTO);
} catch (XMLStreamException ex) {
throw new EJBException("ERROR occurred while parsing XML fragment (" + DcXmlToParse.substring(0, 64) + "...); ", ex);
} finally {
if (xmlr != null) {
try {
xmlr.close();
} catch (XMLStreamException ex) {
}
}
}


Expand Down Expand Up @@ -555,9 +564,7 @@ public ImportGenericServiceBean() {

/**
 * Creates an import service for the given import type.
 *
 * @param importType the kind of import being performed
 */
public ImportGenericServiceBean(ImportType importType) {
    this.importType = importType;
    // Hardened factory (DTDs/external entities disabled) to prevent XXE.
    // NOTE(review): the previous factory explicitly enabled isCoalescing;
    // assumes XmlUtil.getSecureXMLInputFactory() does the same -- confirm.
    xmlInputFactory = XmlUtil.getSecureXMLInputFactory();
}


Expand All @@ -583,21 +590,24 @@ public Map<String, String> mapDCTerms(String xmlToParse, DatasetDTO datasetDTO)
Map<String, String> filesMap = new HashMap<>();
StringReader reader = new StringReader(xmlToParse);
XMLStreamReader xmlr = null;
XMLInputFactory xmlFactory = javax.xml.stream.XMLInputFactory.newInstance();
XMLInputFactory xmlFactory = XmlUtil.getSecureXMLInputFactory();
xmlr = xmlFactory.createXMLStreamReader(reader);
processDCTerms(xmlr, datasetDTO, filesMap);

if (xmlr != null) {
try {
xmlr.close();
} catch (XMLStreamException ex) {
}
}
return filesMap;
}


public Map<String, String> mapDCTerms(File ddiFile, DatasetDTO datasetDTO) {
FileInputStream in = null;
XMLStreamReader xmlr = null;
Map<String, String> filesMap = new HashMap<>();

try {
in = new FileInputStream(ddiFile);
try (FileInputStream in = new FileInputStream(ddiFile)) {
xmlr = xmlInputFactory.createXMLStreamReader(in);
processDCTerms( xmlr, datasetDTO , filesMap );
} catch (FileNotFoundException ex) {
Expand All @@ -606,14 +616,11 @@ public Map<String, String> mapDCTerms(File ddiFile, DatasetDTO datasetDTO) {
} catch (XMLStreamException ex) {
Logger.getLogger("global").log(Level.SEVERE, null, ex);
throw new EJBException("ERROR occurred in mapDDI.", ex);
} catch (IOException e) {
} finally {
try {
if (xmlr != null) { xmlr.close(); }
} catch (XMLStreamException ex) {}

try {
if (in != null) { in.close();}
} catch (IOException ex) {}
}

return filesMap;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@
import edu.harvard.iq.dataverse.authorization.providers.oauth2.OAuth2UserRecord;
import edu.harvard.iq.dataverse.authorization.users.AuthenticatedUser;
import edu.harvard.iq.dataverse.util.BundleUtil;
import edu.harvard.iq.dataverse.util.xml.XmlUtil;

import java.io.IOException;
import java.io.StringReader;
import java.util.*;
Expand Down Expand Up @@ -111,52 +113,50 @@ final protected OAuth2UserRecord getUserRecord(@NotNull String responseBody, @No

@Override
protected ParsedUserResponse parseUserResponse(String responseBody) {
DocumentBuilderFactory dbFact = DocumentBuilderFactory.newInstance();
try ( StringReader reader = new StringReader(responseBody)) {
DocumentBuilder db = dbFact.newDocumentBuilder();
Document doc = db.parse( new InputSource(reader) );

String firstName = getNodes(doc, "person:person", "person:name", "personal-details:given-names" )
.stream().findFirst().map( Node::getTextContent )
.map( String::trim ).orElse("");
String familyName = getNodes(doc, "person:person", "person:name", "personal-details:family-name")
.stream().findFirst().map( Node::getTextContent )
.map( String::trim ).orElse("");

// fallback - try to use the credit-name
if ( (firstName + familyName).equals("") ) {
firstName = getNodes(doc, "person:person", "person:name", "personal-details:credit-name" )
.stream().findFirst().map( Node::getTextContent )
.map( String::trim ).orElse("");
}

String primaryEmail = getPrimaryEmail(doc);
List<String> emails = getAllEmails(doc);

// make the username up
String username;
if ( primaryEmail.length() > 0 ) {
username = primaryEmail.split("@")[0];
} else {
username = firstName.split(" ")[0] + "." + familyName;
DocumentBuilder db = XmlUtil.getSecureDocumentBuilder();
if (db != null) {
Document doc = db.parse(new InputSource(reader));

String firstName = getNodes(doc, "person:person", "person:name", "personal-details:given-names")
.stream().findFirst().map(Node::getTextContent)
.map(String::trim).orElse("");
String familyName = getNodes(doc, "person:person", "person:name", "personal-details:family-name")
.stream().findFirst().map(Node::getTextContent)
.map(String::trim).orElse("");

// fallback - try to use the credit-name
if ((firstName + familyName).equals("")) {
firstName = getNodes(doc, "person:person", "person:name", "personal-details:credit-name")
.stream().findFirst().map(Node::getTextContent)
.map(String::trim).orElse("");
}

String primaryEmail = getPrimaryEmail(doc);
List<String> emails = getAllEmails(doc);

// make the username up
String username;
if (primaryEmail.length() > 0) {
username = primaryEmail.split("@")[0];
} else {
username = firstName.split(" ")[0] + "." + familyName;
}
username = username.replaceAll("[^a-zA-Z0-9.]", "");

// returning the parsed user. The user-id-in-provider will be added by the caller, since ORCiD passes it
// on the access token response.
// Affiliation added after a later call.
final ParsedUserResponse userResponse = new ParsedUserResponse(
new AuthenticatedUserDisplayInfo(firstName, familyName, primaryEmail, "", ""), null, username);
userResponse.emails.addAll(emails);

return userResponse;
}
username = username.replaceAll("[^a-zA-Z0-9.]","");

// returning the parsed user. The user-id-in-provider will be added by the caller, since ORCiD passes it
// on the access token response.
// Affilifation added after a later call.
final ParsedUserResponse userResponse = new ParsedUserResponse(
new AuthenticatedUserDisplayInfo(firstName, familyName, primaryEmail, "", ""), null, username);
userResponse.emails.addAll(emails);

return userResponse;

} catch (SAXException ex) {
logger.log(Level.SEVERE, "XML error parsing response body from ORCiD: " + ex.getMessage(), ex);
} catch (IOException ex) {
logger.log(Level.SEVERE, "I/O error parsing response body from ORCiD: " + ex.getMessage(), ex);
} catch (ParserConfigurationException ex) {
logger.log(Level.SEVERE, "While parsing the ORCiD response: Bad parse configuration. " + ex.getMessage(), ex);
}

return null;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
import edu.harvard.iq.dataverse.util.SystemConfig;
import edu.harvard.iq.dataverse.util.json.JsonUtil;
import edu.harvard.iq.dataverse.util.xml.XmlPrinter;
import edu.harvard.iq.dataverse.util.xml.XmlUtil;
import edu.harvard.iq.dataverse.util.xml.XmlWriterUtil;

import java.io.ByteArrayOutputStream;
Expand All @@ -41,20 +42,15 @@
import java.util.Map.Entry;
import java.util.logging.Level;
import java.util.logging.Logger;
import jakarta.ejb.EJB;
import jakarta.json.Json;
import jakarta.json.JsonArray;
import jakarta.json.JsonArrayBuilder;
import jakarta.json.JsonObject;
import jakarta.json.JsonString;
import jakarta.json.JsonValue;
import javax.xml.stream.XMLOutputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamWriter;

import javax.xml.XMLConstants;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import org.xml.sax.SAXException;
import org.w3c.dom.Document;
import org.apache.commons.lang3.StringUtils;
Expand Down Expand Up @@ -2012,17 +2008,24 @@ private static void createFileDscr(XMLStreamWriter xmlw, JsonArray fileDetails)


public static void datasetHtmlDDI(InputStream datafile, OutputStream outputStream) throws XMLStreamException {
DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();

try {
Document document;
InputStream styleSheetInput = DdiExportUtil.class.getClassLoader().getResourceAsStream("edu/harvard/iq/dataverse/codebook2-0.xsl");
// Get secure DocumentBuilder from our utility class
DocumentBuilder builder = XmlUtil.getSecureDocumentBuilder();
if (builder == null) {
logger.severe("Could not create secure document builder");
return;
}
InputStream styleSheetInput = DdiExportUtil.class.getClassLoader().getResourceAsStream("edu/harvard/iq/dataverse/codebook2-0.xsl");

DocumentBuilder builder = factory.newDocumentBuilder();
document = builder.parse(datafile);
Document document = builder.parse(datafile);

// Use a Transformer for output
TransformerFactory tFactory = TransformerFactory.newInstance();
// Set secure processing feature
tFactory.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true);
tFactory.setAttribute(XMLConstants.ACCESS_EXTERNAL_STYLESHEET, "");

StreamSource stylesource = new StreamSource(styleSheetInput);
Transformer transformer = tFactory.newTransformer(stylesource);

Expand All @@ -2035,20 +2038,14 @@ public static void datasetHtmlDDI(InputStream datafile, OutputStream outputStrea
} catch (TransformerException te) {
// Error generated by the parser
logger.severe("Transformation error" + " " + te.getMessage());

} catch (SAXException sxe) {
// Error generated by this application
// (or a parser-initialization error)
logger.severe("SAX error " + sxe.getMessage());

} catch (ParserConfigurationException pce) {
// Parser with specified options can't be built
logger.severe("Parser configuration error " + pce.getMessage());
} catch (IOException ioe) {
// I/O error
logger.warning("I/O error " + ioe.getMessage());
}

}

public static void injectSettingsService(SettingsServiceBean settingsSvc) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@
package edu.harvard.iq.dataverse.harvest.client;

import edu.harvard.iq.dataverse.harvest.client.oai.OaiHandler;
import edu.harvard.iq.dataverse.util.xml.XmlUtil;

import java.io.IOException;

import java.io.InputStream;
Expand Down Expand Up @@ -126,7 +128,7 @@ public boolean isDeleted () {
public void harvestRecord(String baseURL, String identifier, String metadataPrefix, Map<String,String> customHeaders, HttpClient httpClient) throws IOException,
ParserConfigurationException, SAXException, TransformerException{

xmlInputFactory = javax.xml.stream.XMLInputFactory.newInstance();
xmlInputFactory = XmlUtil.getSecureXMLInputFactory();
String requestURL = getRequestURL(baseURL, identifier, metadataPrefix);
InputStream in;

Expand Down
Loading