Skip to content
Merged
1 change: 1 addition & 0 deletions doc/release-notes/xmlutil.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
The configuration of XML parsers used in Dataverse has been centralized and unused functionality has been turned off to enhance security.
2 changes: 1 addition & 1 deletion doc/sphinx-guides/source/api/changelog.rst
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ v6.7
- An undocumented :doc:`search` parameter called "show_my_data" has been removed. It was never exercised by tests and is believed to be unused. API users should use the :ref:`api-mydata` API instead.
- /api/datasets/{id}/curationStatus API now includes a JSON object with curation label, createtime, and assigner rather than a string 'label' and it supports a new boolean includeHistory parameter (default false) that returns a JSON array of statuses
- /api/datasets/{id}/listCurationStates includes new columns "Status Set Time" and "Status Set By", listing the time the current status was applied and by whom. It also supports the boolean includeHistory parameter.
- Due to updates in libraries used by Dataverse, XML serialization may have changed slightly with respect to whether self-closing tags are used for empty elements. This primiarily affects XML-based metadata exports. The XML structure of the export itself has not changed, so this is only an incompatibility if you are not using an XML parser.
- Due to updates in libraries used by Dataverse, XML serialization may have changed slightly with respect to whether self-closing tags are used for empty elements. This primarily affects XML-based metadata exports. The XML structure of the export itself has not changed, so this is only an incompatibility if you are not using an XML parser.

v6.6
----
Expand Down
12 changes: 9 additions & 3 deletions src/main/java/edu/harvard/iq/dataverse/api/EditDDI.java
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
import edu.harvard.iq.dataverse.datavariable.VariableCategory;
import edu.harvard.iq.dataverse.datavariable.VariableMetadataDDIParser;
import edu.harvard.iq.dataverse.search.IndexServiceBean;

import edu.harvard.iq.dataverse.util.xml.XmlUtil;
import jakarta.ejb.EJB;
import jakarta.ejb.EJBException;
import jakarta.ejb.Stateless;
Expand Down Expand Up @@ -355,13 +355,19 @@ private boolean updateDraftVersion(ArrayList<VariableMetadata> neededToUpdateVM,

/**
 * Parses variable-level DDI metadata from the given XML stream, populating
 * the supplied maps via {@link VariableMetadataDDIParser#processDataDscr}.
 *
 * @param body           input stream containing the DDI XML fragment
 * @param mapVarToVarMet populated with variable id -> variable metadata
 * @param varGroupMap    populated with group id -> variable group
 * @throws XMLStreamException   if the XML is malformed
 * @throws NullPointerException declared for caller compatibility
 */
private void readXML(InputStream body, Map<Long,VariableMetadata> mapVarToVarMet, Map<Long,VarGroup> varGroupMap) throws XMLStreamException, NullPointerException {

    // Hardened factory (DTDs/external entities disabled) to prevent XXE.
    XMLInputFactory factory = XmlUtil.getSecureXMLInputFactory();
    XMLStreamReader xmlr = factory.createXMLStreamReader(body);

    VariableMetadataDDIParser vmdp = new VariableMetadataDDIParser();

    try {
        vmdp.processDataDscr(xmlr, mapVarToVarMet, varGroupMap);
    } finally {
        // Close in finally so the reader is released even when parsing fails.
        try {
            xmlr.close();
        } catch (XMLStreamException e) {
            logger.warning("XMLStreamException closing XMLStreamReader in readXML");
        }
    }
}

private boolean newGroups(Map<Long,VarGroup> varGroupMap, FileMetadata fm) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@
import edu.harvard.iq.dataverse.license.License;
import edu.harvard.iq.dataverse.license.LicenseServiceBean;
import edu.harvard.iq.dataverse.util.StringUtil;
import edu.harvard.iq.dataverse.util.xml.XmlUtil;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
Expand Down Expand Up @@ -121,8 +123,7 @@ public class ImportDDIServiceBean {
// TODO: stop passing the xml source as a string; (it could be huge!) -- L.A. 4.5
// TODO: what L.A. Said.
public DatasetDTO doImport(ImportType importType, String xmlToParse) throws XMLStreamException, ImportException {
xmlInputFactory = javax.xml.stream.XMLInputFactory.newInstance();
xmlInputFactory.setProperty("javax.xml.stream.isCoalescing", java.lang.Boolean.TRUE); DatasetDTO datasetDTO = this.initializeDataset();
DatasetDTO datasetDTO = this.initializeDataset();

// Read docDescr and studyDesc into DTO objects.
// TODO: the fileMap is likely not needed.
Expand All @@ -147,11 +148,16 @@ public Map<String, String> mapDDI(ImportType importType, String xmlToParse, Data
Map<String, String> filesMap = new HashMap<>();
StringReader reader = new StringReader(xmlToParse);
XMLStreamReader xmlr = null;
XMLInputFactory xmlFactory = javax.xml.stream.XMLInputFactory.newInstance();
xmlFactory.setProperty("javax.xml.stream.isCoalescing", true); // allows the parsing of a CDATA segment into a single event
XMLInputFactory xmlFactory = XmlUtil.getSecureXMLInputFactory();
xmlr = xmlFactory.createXMLStreamReader(reader);
processDDI(importType, xmlr, datasetDTO, filesMap);

if (xmlr != null) {
try {
xmlr.close();
} catch (XMLStreamException e) {
logger.warning("XMLStreamException closing XMLStreamReader in mapDDI()");
}
}
return filesMap;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@
import edu.harvard.iq.dataverse.util.StringUtil;
import edu.harvard.iq.dataverse.util.json.JsonParseException;
import edu.harvard.iq.dataverse.util.json.JsonParser;
import edu.harvard.iq.dataverse.util.xml.XmlUtil;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
Expand Down Expand Up @@ -106,7 +108,7 @@ public void importXML(String xmlToParse, String foreignFormat, DatasetVersion da

try {
reader = new StringReader(xmlToParse);
XMLInputFactory xmlFactory = javax.xml.stream.XMLInputFactory.newInstance();
XMLInputFactory xmlFactory = XmlUtil.getSecureXMLInputFactory();
xmlr = xmlFactory.createXMLStreamReader(reader);
DatasetDTO datasetDTO = processXML(xmlr, mappingSupported);

Expand Down Expand Up @@ -173,7 +175,7 @@ public DatasetDTO processOAIDCxml(String DcXmlToParse, String oaiIdentifier, boo

try {
reader = new StringReader(DcXmlToParse);
XMLInputFactory xmlFactory = javax.xml.stream.XMLInputFactory.newInstance();
XMLInputFactory xmlFactory = XmlUtil.getSecureXMLInputFactory();
xmlr = xmlFactory.createXMLStreamReader(reader);

//while (xmlr.next() == XMLStreamConstants.COMMENT); // skip pre root comments
Expand All @@ -184,6 +186,13 @@ public DatasetDTO processOAIDCxml(String DcXmlToParse, String oaiIdentifier, boo
processXMLElement(xmlr, ":", OAI_DC_OPENING_TAG, dublinCoreMapping, datasetDTO);
} catch (XMLStreamException ex) {
throw new EJBException("ERROR occurred while parsing XML fragment (" + DcXmlToParse.substring(0, 64) + "...); ", ex);
} finally {
if (xmlr != null) {
try {
xmlr.close();
} catch (XMLStreamException ex) {
}
}
}


Expand Down Expand Up @@ -555,9 +564,7 @@ public ImportGenericServiceBean() {

/**
 * Creates an import service for the given import type.
 *
 * @param importType the kind of import being performed
 */
public ImportGenericServiceBean(ImportType importType) {
    this.importType = importType;
    // Hardened factory (DTDs/external entities disabled) to prevent XXE.
    // NOTE(review): the previous factory explicitly enabled isCoalescing;
    // assumes XmlUtil.getSecureXMLInputFactory() does the same -- confirm.
    xmlInputFactory = XmlUtil.getSecureXMLInputFactory();
}


Expand All @@ -583,21 +590,24 @@ public Map<String, String> mapDCTerms(String xmlToParse, DatasetDTO datasetDTO)
Map<String, String> filesMap = new HashMap<>();
StringReader reader = new StringReader(xmlToParse);
XMLStreamReader xmlr = null;
XMLInputFactory xmlFactory = javax.xml.stream.XMLInputFactory.newInstance();
XMLInputFactory xmlFactory = XmlUtil.getSecureXMLInputFactory();
xmlr = xmlFactory.createXMLStreamReader(reader);
processDCTerms(xmlr, datasetDTO, filesMap);

if (xmlr != null) {
try {
xmlr.close();
} catch (XMLStreamException ex) {
}
}
return filesMap;
}


public Map<String, String> mapDCTerms(File ddiFile, DatasetDTO datasetDTO) {
FileInputStream in = null;
XMLStreamReader xmlr = null;
Map<String, String> filesMap = new HashMap<>();

try {
in = new FileInputStream(ddiFile);
try (FileInputStream in = new FileInputStream(ddiFile)) {
xmlr = xmlInputFactory.createXMLStreamReader(in);
processDCTerms( xmlr, datasetDTO , filesMap );
} catch (FileNotFoundException ex) {
Expand All @@ -606,14 +616,11 @@ public Map<String, String> mapDCTerms(File ddiFile, DatasetDTO datasetDTO) {
} catch (XMLStreamException ex) {
Logger.getLogger("global").log(Level.SEVERE, null, ex);
throw new EJBException("ERROR occurred in mapDDI.", ex);
} catch (IOException e) {
} finally {
try {
if (xmlr != null) { xmlr.close(); }
} catch (XMLStreamException ex) {}

try {
if (in != null) { in.close();}
} catch (IOException ex) {}
}

return filesMap;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@
import edu.harvard.iq.dataverse.authorization.providers.oauth2.OAuth2UserRecord;
import edu.harvard.iq.dataverse.authorization.users.AuthenticatedUser;
import edu.harvard.iq.dataverse.util.BundleUtil;
import edu.harvard.iq.dataverse.util.xml.XmlUtil;

import java.io.IOException;
import java.io.StringReader;
import java.util.*;
Expand Down Expand Up @@ -111,52 +113,50 @@ final protected OAuth2UserRecord getUserRecord(@NotNull String responseBody, @No

@Override
protected ParsedUserResponse parseUserResponse(String responseBody) {
DocumentBuilderFactory dbFact = DocumentBuilderFactory.newInstance();
try ( StringReader reader = new StringReader(responseBody)) {
DocumentBuilder db = dbFact.newDocumentBuilder();
Document doc = db.parse( new InputSource(reader) );

String firstName = getNodes(doc, "person:person", "person:name", "personal-details:given-names" )
.stream().findFirst().map( Node::getTextContent )
.map( String::trim ).orElse("");
String familyName = getNodes(doc, "person:person", "person:name", "personal-details:family-name")
.stream().findFirst().map( Node::getTextContent )
.map( String::trim ).orElse("");

// fallback - try to use the credit-name
if ( (firstName + familyName).equals("") ) {
firstName = getNodes(doc, "person:person", "person:name", "personal-details:credit-name" )
.stream().findFirst().map( Node::getTextContent )
.map( String::trim ).orElse("");
}

String primaryEmail = getPrimaryEmail(doc);
List<String> emails = getAllEmails(doc);

// make the username up
String username;
if ( primaryEmail.length() > 0 ) {
username = primaryEmail.split("@")[0];
} else {
username = firstName.split(" ")[0] + "." + familyName;
DocumentBuilder db = XmlUtil.getSecureDocumentBuilder();
if (db != null) {
Document doc = db.parse(new InputSource(reader));

String firstName = getNodes(doc, "person:person", "person:name", "personal-details:given-names")
.stream().findFirst().map(Node::getTextContent)
.map(String::trim).orElse("");
String familyName = getNodes(doc, "person:person", "person:name", "personal-details:family-name")
.stream().findFirst().map(Node::getTextContent)
.map(String::trim).orElse("");

// fallback - try to use the credit-name
if ((firstName + familyName).equals("")) {
firstName = getNodes(doc, "person:person", "person:name", "personal-details:credit-name")
.stream().findFirst().map(Node::getTextContent)
.map(String::trim).orElse("");
}

String primaryEmail = getPrimaryEmail(doc);
List<String> emails = getAllEmails(doc);

// make the username up
String username;
if (primaryEmail.length() > 0) {
username = primaryEmail.split("@")[0];
} else {
username = firstName.split(" ")[0] + "." + familyName;
}
username = username.replaceAll("[^a-zA-Z0-9.]", "");

// returning the parsed user. The user-id-in-provider will be added by the caller, since ORCiD passes it
// on the access token response.
// Affiliation added after a later call.
final ParsedUserResponse userResponse = new ParsedUserResponse(
new AuthenticatedUserDisplayInfo(firstName, familyName, primaryEmail, "", ""), null, username);
userResponse.emails.addAll(emails);

return userResponse;
}
username = username.replaceAll("[^a-zA-Z0-9.]","");

// returning the parsed user. The user-id-in-provider will be added by the caller, since ORCiD passes it
// on the access token response.
// Affilifation added after a later call.
final ParsedUserResponse userResponse = new ParsedUserResponse(
new AuthenticatedUserDisplayInfo(firstName, familyName, primaryEmail, "", ""), null, username);
userResponse.emails.addAll(emails);

return userResponse;

} catch (SAXException ex) {
logger.log(Level.SEVERE, "XML error parsing response body from ORCiD: " + ex.getMessage(), ex);
} catch (IOException ex) {
logger.log(Level.SEVERE, "I/O error parsing response body from ORCiD: " + ex.getMessage(), ex);
} catch (ParserConfigurationException ex) {
logger.log(Level.SEVERE, "While parsing the ORCiD response: Bad parse configuration. " + ex.getMessage(), ex);
}

return null;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
import edu.harvard.iq.dataverse.util.SystemConfig;
import edu.harvard.iq.dataverse.util.json.JsonUtil;
import edu.harvard.iq.dataverse.util.xml.XmlPrinter;
import edu.harvard.iq.dataverse.util.xml.XmlUtil;
import edu.harvard.iq.dataverse.util.xml.XmlWriterUtil;

import java.io.ByteArrayOutputStream;
Expand All @@ -41,20 +42,15 @@
import java.util.Map.Entry;
import java.util.logging.Level;
import java.util.logging.Logger;
import jakarta.ejb.EJB;
import jakarta.json.Json;
import jakarta.json.JsonArray;
import jakarta.json.JsonArrayBuilder;
import jakarta.json.JsonObject;
import jakarta.json.JsonString;
import jakarta.json.JsonValue;
import javax.xml.stream.XMLOutputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamWriter;

import javax.xml.XMLConstants;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import org.xml.sax.SAXException;
import org.w3c.dom.Document;
import org.apache.commons.lang3.StringUtils;
Expand Down Expand Up @@ -2012,17 +2008,24 @@ private static void createFileDscr(XMLStreamWriter xmlw, JsonArray fileDetails)


public static void datasetHtmlDDI(InputStream datafile, OutputStream outputStream) throws XMLStreamException {
DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();

try {
Document document;
InputStream styleSheetInput = DdiExportUtil.class.getClassLoader().getResourceAsStream("edu/harvard/iq/dataverse/codebook2-0.xsl");
// Get secure DocumentBuilder from our utility class
DocumentBuilder builder = XmlUtil.getSecureDocumentBuilder();
if (builder == null) {
logger.severe("Could not create secure document builder");
return;
}
InputStream styleSheetInput = DdiExportUtil.class.getClassLoader().getResourceAsStream("edu/harvard/iq/dataverse/codebook2-0.xsl");

DocumentBuilder builder = factory.newDocumentBuilder();
document = builder.parse(datafile);
Document document = builder.parse(datafile);

// Use a Transformer for output
TransformerFactory tFactory = TransformerFactory.newInstance();
// Set secure processing feature
tFactory.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true);
tFactory.setAttribute(XMLConstants.ACCESS_EXTERNAL_STYLESHEET, "");

StreamSource stylesource = new StreamSource(styleSheetInput);
Transformer transformer = tFactory.newTransformer(stylesource);

Expand All @@ -2035,20 +2038,14 @@ public static void datasetHtmlDDI(InputStream datafile, OutputStream outputStrea
} catch (TransformerException te) {
// Error generated by the parser
logger.severe("Transformation error" + " " + te.getMessage());

} catch (SAXException sxe) {
// Error generated by this application
// (or a parser-initialization error)
logger.severe("SAX error " + sxe.getMessage());

} catch (ParserConfigurationException pce) {
// Parser with specified options can't be built
logger.severe("Parser configuration error " + pce.getMessage());
} catch (IOException ioe) {
// I/O error
logger.warning("I/O error " + ioe.getMessage());
}

}

public static void injectSettingsService(SettingsServiceBean settingsSvc) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@
package edu.harvard.iq.dataverse.harvest.client;

import edu.harvard.iq.dataverse.harvest.client.oai.OaiHandler;
import edu.harvard.iq.dataverse.util.xml.XmlUtil;

import java.io.IOException;

import java.io.InputStream;
Expand Down Expand Up @@ -126,7 +128,7 @@ public boolean isDeleted () {
public void harvestRecord(String baseURL, String identifier, String metadataPrefix, Map<String,String> customHeaders, HttpClient httpClient) throws IOException,
ParserConfigurationException, SAXException, TransformerException{

xmlInputFactory = javax.xml.stream.XMLInputFactory.newInstance();
xmlInputFactory = XmlUtil.getSecureXMLInputFactory();
String requestURL = getRequestURL(baseURL, identifier, metadataPrefix);
InputStream in;

Expand Down
Loading