diff --git a/doc/release-notes/xmlutil.md b/doc/release-notes/xmlutil.md new file mode 100644 index 00000000000..e02ac032e4d --- /dev/null +++ b/doc/release-notes/xmlutil.md @@ -0,0 +1 @@ +The configuration of XML parsers used in Dataverse has been centralized and unused functionality has been turned off to enhance security. \ No newline at end of file diff --git a/doc/sphinx-guides/source/api/changelog.rst b/doc/sphinx-guides/source/api/changelog.rst index 50a3fb5a8a4..46b4a9e6f00 100644 --- a/doc/sphinx-guides/source/api/changelog.rst +++ b/doc/sphinx-guides/source/api/changelog.rst @@ -13,7 +13,7 @@ v6.7 - An undocumented :doc:`search` parameter called "show_my_data" has been removed. It was never exercised by tests and is believed to be unused. API users should use the :ref:`api-mydata` API instead. - /api/datasets/{id}/curationStatus API now includes a JSON object with curation label, createtime, and assigner rather than a string 'label' and it supports a new boolean includeHistory parameter (default false) that returns a JSON array of statuses - /api/datasets/{id}/listCurationStates includes new columns "Status Set Time" and "Status Set By" columns listing the time the current status was applied and by whom. It also supports the boolean includeHistory parameter. -- Due to updates in libraries used by Dataverse, XML serialization may have changed slightly with respect to whether self-closing tags are used for empty elements. This primiarily affects XML-based metadata exports. The XML structure of the export itself has not changed, so this is only an incompatibility if you are not using an XML parser. +- Due to updates in libraries used by Dataverse, XML serialization may have changed slightly with respect to whether self-closing tags are used for empty elements. This primarily affects XML-based metadata exports. The XML structure of the export itself has not changed, so this is only an incompatibility if you are not using an XML parser. v6.6 ---- diff --git a/src/main/java/edu/harvard/iq/dataverse/api/EditDDI.java b/src/main/java/edu/harvard/iq/dataverse/api/EditDDI.java index 1a5b02b8ba3..1755e66823a 100644 --- a/src/main/java/edu/harvard/iq/dataverse/api/EditDDI.java +++ b/src/main/java/edu/harvard/iq/dataverse/api/EditDDI.java @@ -26,7 +26,7 @@ import edu.harvard.iq.dataverse.datavariable.VariableCategory; import edu.harvard.iq.dataverse.datavariable.VariableMetadataDDIParser; import edu.harvard.iq.dataverse.search.IndexServiceBean; - +import edu.harvard.iq.dataverse.util.xml.XmlUtil; import jakarta.ejb.EJB; import jakarta.ejb.EJBException; import jakarta.ejb.Stateless; @@ -355,13 +355,19 @@ private boolean updateDraftVersion(ArrayList neededToUpdateVM, private void readXML(InputStream body, Map mapVarToVarMet, Map varGroupMap) throws XMLStreamException, NullPointerException { - XMLInputFactory factory=XMLInputFactory.newInstance(); + XMLInputFactory factory=XmlUtil.getSecureXMLInputFactory(); XMLStreamReader xmlr=factory.createXMLStreamReader(body); VariableMetadataDDIParser vmdp = new VariableMetadataDDIParser(); vmdp.processDataDscr(xmlr, mapVarToVarMet, varGroupMap); - + if (xmlr != null) { + try { + xmlr.close(); + } catch (XMLStreamException e) { + logger.warning("XMLStreamException closing XMLStreamReader in readXml"); + } + } } private boolean newGroups(Map varGroupMap, FileMetadata fm) { diff --git a/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportDDIServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportDDIServiceBean.java index 31941d3c8c0..fe8599150e7 100644 --- a/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportDDIServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportDDIServiceBean.java @@ -21,6 +21,8 @@ import edu.harvard.iq.dataverse.license.License; import edu.harvard.iq.dataverse.license.LicenseServiceBean; import edu.harvard.iq.dataverse.util.StringUtil; +import edu.harvard.iq.dataverse.util.xml.XmlUtil; + import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; @@ -121,8 +123,7 @@ public class ImportDDIServiceBean { // TODO: stop passing the xml source as a string; (it could be huge!) -- L.A. 4.5 // TODO: what L.A. Said. public DatasetDTO doImport(ImportType importType, String xmlToParse) throws XMLStreamException, ImportException { - xmlInputFactory = javax.xml.stream.XMLInputFactory.newInstance(); - xmlInputFactory.setProperty("javax.xml.stream.isCoalescing", java.lang.Boolean.TRUE); DatasetDTO datasetDTO = this.initializeDataset(); + DatasetDTO datasetDTO = this.initializeDataset(); // Read docDescr and studyDesc into DTO objects. // TODO: the fileMap is likely not needed. @@ -147,11 +148,16 @@ public Map mapDDI(ImportType importType, String xmlToParse, Data Map filesMap = new HashMap<>(); StringReader reader = new StringReader(xmlToParse); XMLStreamReader xmlr = null; - XMLInputFactory xmlFactory = javax.xml.stream.XMLInputFactory.newInstance(); - xmlFactory.setProperty("javax.xml.stream.isCoalescing", true); // allows the parsing of a CDATA segment into a single event + XMLInputFactory xmlFactory = XmlUtil.getSecureXMLInputFactory(); xmlr = xmlFactory.createXMLStreamReader(reader); processDDI(importType, xmlr, datasetDTO, filesMap); - + if (xmlr != null) { + try { + xmlr.close(); + } catch (XMLStreamException e) { + logger.warning("XMLStreamException closing XMLStreamReader in mapDDI()"); + } + } return filesMap; } diff --git a/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportGenericServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportGenericServiceBean.java index 5e1c2108607..d75ea42e433 100644 --- a/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportGenericServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportGenericServiceBean.java @@ -23,6 +23,8 @@ import edu.harvard.iq.dataverse.util.StringUtil; import edu.harvard.iq.dataverse.util.json.JsonParseException; import edu.harvard.iq.dataverse.util.json.JsonParser; +import edu.harvard.iq.dataverse.util.xml.XmlUtil; + import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; @@ -106,7 +108,7 @@ public void importXML(String xmlToParse, String foreignFormat, DatasetVersion da try { reader = new StringReader(xmlToParse); - XMLInputFactory xmlFactory = javax.xml.stream.XMLInputFactory.newInstance(); + XMLInputFactory xmlFactory = XmlUtil.getSecureXMLInputFactory(); xmlr = xmlFactory.createXMLStreamReader(reader); DatasetDTO datasetDTO = processXML(xmlr, mappingSupported); @@ -173,7 +175,7 @@ public DatasetDTO processOAIDCxml(String DcXmlToParse, String oaiIdentifier, boo try { reader = new StringReader(DcXmlToParse); - XMLInputFactory xmlFactory = javax.xml.stream.XMLInputFactory.newInstance(); + XMLInputFactory xmlFactory = XmlUtil.getSecureXMLInputFactory(); xmlr = xmlFactory.createXMLStreamReader(reader); //while (xmlr.next() == XMLStreamConstants.COMMENT); // skip pre root comments @@ -184,6 +186,13 @@ public DatasetDTO processOAIDCxml(String DcXmlToParse, String oaiIdentifier, boo processXMLElement(xmlr, ":", OAI_DC_OPENING_TAG, dublinCoreMapping, datasetDTO); } catch (XMLStreamException ex) { throw new EJBException("ERROR occurred while parsing XML fragment (" + DcXmlToParse.substring(0, 64) + "...); ", ex); + } finally { + if (xmlr != null) { + try { + xmlr.close(); + } catch (XMLStreamException ex) { + } + } } @@ -555,9 +564,7 @@ public ImportGenericServiceBean() { public ImportGenericServiceBean(ImportType importType) { this.importType=importType; - xmlInputFactory = javax.xml.stream.XMLInputFactory.newInstance(); - xmlInputFactory.setProperty("javax.xml.stream.isCoalescing", java.lang.Boolean.TRUE); - + xmlInputFactory = XmlUtil.getSecureXMLInputFactory(); } @@ -583,21 +590,24 @@ public Map mapDCTerms(String xmlToParse, DatasetDTO datasetDTO) Map filesMap = new HashMap<>(); StringReader reader = new StringReader(xmlToParse); XMLStreamReader xmlr = null; - XMLInputFactory xmlFactory = javax.xml.stream.XMLInputFactory.newInstance(); + XMLInputFactory xmlFactory = XmlUtil.getSecureXMLInputFactory(); xmlr = xmlFactory.createXMLStreamReader(reader); processDCTerms(xmlr, datasetDTO, filesMap); - + if (xmlr != null) { + try { + xmlr.close(); + } catch (XMLStreamException ex) { + } + } return filesMap; } public Map mapDCTerms(File ddiFile, DatasetDTO datasetDTO) { - FileInputStream in = null; XMLStreamReader xmlr = null; Map filesMap = new HashMap<>(); - try { - in = new FileInputStream(ddiFile); + try (FileInputStream in = new FileInputStream(ddiFile)) { xmlr = xmlInputFactory.createXMLStreamReader(in); processDCTerms( xmlr, datasetDTO , filesMap ); } catch (FileNotFoundException ex) { @@ -606,14 +616,11 @@ public Map mapDCTerms(File ddiFile, DatasetDTO datasetDTO) { } catch (XMLStreamException ex) { Logger.getLogger("global").log(Level.SEVERE, null, ex); throw new EJBException("ERROR occurred in mapDDI.", ex); + } catch (IOException e) { } finally { try { if (xmlr != null) { xmlr.close(); } } catch (XMLStreamException ex) {} - - try { - if (in != null) { in.close();} - } catch (IOException ex) {} } return filesMap; diff --git a/src/main/java/edu/harvard/iq/dataverse/authorization/providers/oauth2/impl/OrcidOAuth2AP.java b/src/main/java/edu/harvard/iq/dataverse/authorization/providers/oauth2/impl/OrcidOAuth2AP.java index ddf527b95a7..f32ff2b18e5 100644 --- a/src/main/java/edu/harvard/iq/dataverse/authorization/providers/oauth2/impl/OrcidOAuth2AP.java +++ b/src/main/java/edu/harvard/iq/dataverse/authorization/providers/oauth2/impl/OrcidOAuth2AP.java @@ -14,6 +14,8 @@ import edu.harvard.iq.dataverse.authorization.providers.oauth2.OAuth2UserRecord; import edu.harvard.iq.dataverse.authorization.users.AuthenticatedUser; import edu.harvard.iq.dataverse.util.BundleUtil; +import edu.harvard.iq.dataverse.util.xml.XmlUtil; + import java.io.IOException; import java.io.StringReader; import java.util.*; @@ -111,52 +113,50 @@ final protected OAuth2UserRecord getUserRecord(@NotNull String responseBody, @No @Override protected ParsedUserResponse parseUserResponse(String responseBody) { - DocumentBuilderFactory dbFact = DocumentBuilderFactory.newInstance(); try ( StringReader reader = new StringReader(responseBody)) { - DocumentBuilder db = dbFact.newDocumentBuilder(); - Document doc = db.parse( new InputSource(reader) ); - - String firstName = getNodes(doc, "person:person", "person:name", "personal-details:given-names" ) - .stream().findFirst().map( Node::getTextContent ) - .map( String::trim ).orElse(""); - String familyName = getNodes(doc, "person:person", "person:name", "personal-details:family-name") - .stream().findFirst().map( Node::getTextContent ) - .map( String::trim ).orElse(""); - - // fallback - try to use the credit-name - if ( (firstName + familyName).equals("") ) { - firstName = getNodes(doc, "person:person", "person:name", "personal-details:credit-name" ) - .stream().findFirst().map( Node::getTextContent ) - .map( String::trim ).orElse(""); - } - - String primaryEmail = getPrimaryEmail(doc); - List emails = getAllEmails(doc); - - // make the username up - String username; - if ( primaryEmail.length() > 0 ) { - username = primaryEmail.split("@")[0]; - } else { - username = firstName.split(" ")[0] + "." + familyName; + DocumentBuilder db = XmlUtil.getSecureDocumentBuilder(); + if (db != null) { + Document doc = db.parse(new InputSource(reader)); + + String firstName = getNodes(doc, "person:person", "person:name", "personal-details:given-names") + .stream().findFirst().map(Node::getTextContent) + .map(String::trim).orElse(""); + String familyName = getNodes(doc, "person:person", "person:name", "personal-details:family-name") + .stream().findFirst().map(Node::getTextContent) + .map(String::trim).orElse(""); + + // fallback - try to use the credit-name + if ((firstName + familyName).equals("")) { + firstName = getNodes(doc, "person:person", "person:name", "personal-details:credit-name") + .stream().findFirst().map(Node::getTextContent) + .map(String::trim).orElse(""); + } + + String primaryEmail = getPrimaryEmail(doc); + List emails = getAllEmails(doc); + + // make the username up + String username; + if (primaryEmail.length() > 0) { + username = primaryEmail.split("@")[0]; + } else { + username = firstName.split(" ")[0] + "." + familyName; + } + username = username.replaceAll("[^a-zA-Z0-9.]", ""); + + // returning the parsed user. The user-id-in-provider will be added by the caller, since ORCiD passes it + // on the access token response. + // Affiliation added after a later call. + final ParsedUserResponse userResponse = new ParsedUserResponse( + new AuthenticatedUserDisplayInfo(firstName, familyName, primaryEmail, "", ""), null, username); + userResponse.emails.addAll(emails); + + return userResponse; } - username = username.replaceAll("[^a-zA-Z0-9.]",""); - - // returning the parsed user. The user-id-in-provider will be added by the caller, since ORCiD passes it - // on the access token response. - // Affilifation added after a later call. - final ParsedUserResponse userResponse = new ParsedUserResponse( - new AuthenticatedUserDisplayInfo(firstName, familyName, primaryEmail, "", ""), null, username); - userResponse.emails.addAll(emails); - - return userResponse; - } catch (SAXException ex) { logger.log(Level.SEVERE, "XML error parsing response body from ORCiD: " + ex.getMessage(), ex); } catch (IOException ex) { logger.log(Level.SEVERE, "I/O error parsing response body from ORCiD: " + ex.getMessage(), ex); - } catch (ParserConfigurationException ex) { - logger.log(Level.SEVERE, "While parsing the ORCiD response: Bad parse configuration. " + ex.getMessage(), ex); } return null; diff --git a/src/main/java/edu/harvard/iq/dataverse/export/ddi/DdiExportUtil.java b/src/main/java/edu/harvard/iq/dataverse/export/ddi/DdiExportUtil.java index a27386893bd..f5cc86bf8ee 100644 --- a/src/main/java/edu/harvard/iq/dataverse/export/ddi/DdiExportUtil.java +++ b/src/main/java/edu/harvard/iq/dataverse/export/ddi/DdiExportUtil.java @@ -29,6 +29,7 @@ import edu.harvard.iq.dataverse.util.SystemConfig; import edu.harvard.iq.dataverse.util.json.JsonUtil; import edu.harvard.iq.dataverse.util.xml.XmlPrinter; +import edu.harvard.iq.dataverse.util.xml.XmlUtil; import edu.harvard.iq.dataverse.util.xml.XmlWriterUtil; import java.io.ByteArrayOutputStream; @@ -41,20 +42,15 @@ import java.util.Map.Entry; import java.util.logging.Level; import java.util.logging.Logger; -import jakarta.ejb.EJB; -import jakarta.json.Json; import jakarta.json.JsonArray; -import jakarta.json.JsonArrayBuilder; import jakarta.json.JsonObject; import jakarta.json.JsonString; import jakarta.json.JsonValue; import javax.xml.stream.XMLOutputFactory; import javax.xml.stream.XMLStreamException; import javax.xml.stream.XMLStreamWriter; - +import javax.xml.XMLConstants; import javax.xml.parsers.DocumentBuilder; -import javax.xml.parsers.DocumentBuilderFactory; -import javax.xml.parsers.ParserConfigurationException; import org.xml.sax.SAXException; import org.w3c.dom.Document; import org.apache.commons.lang3.StringUtils; @@ -2012,17 +2008,24 @@ private static void createFileDscr(XMLStreamWriter xmlw, JsonArray fileDetails) public static void datasetHtmlDDI(InputStream datafile, OutputStream outputStream) throws XMLStreamException { - DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); try { - Document document; - InputStream styleSheetInput = DdiExportUtil.class.getClassLoader().getResourceAsStream("edu/harvard/iq/dataverse/codebook2-0.xsl"); + // Get secure DocumentBuilder from our utility class + DocumentBuilder builder = XmlUtil.getSecureDocumentBuilder(); + if (builder == null) { + logger.severe("Could not create secure document builder"); + return; + } + InputStream styleSheetInput = DdiExportUtil.class.getClassLoader().getResourceAsStream("edu/harvard/iq/dataverse/codebook2-0.xsl"); - DocumentBuilder builder = factory.newDocumentBuilder(); - document = builder.parse(datafile); + Document document = builder.parse(datafile); // Use a Transformer for output TransformerFactory tFactory = TransformerFactory.newInstance(); + // Set secure processing feature + tFactory.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true); + tFactory.setAttribute(XMLConstants.ACCESS_EXTERNAL_STYLESHEET, ""); + StreamSource stylesource = new StreamSource(styleSheetInput); Transformer transformer = tFactory.newTransformer(stylesource); @@ -2035,20 +2038,14 @@ public static void datasetHtmlDDI(InputStream datafile, OutputStream outputStrea } catch (TransformerException te) { // Error generated by the parser logger.severe("Transformation error" + " " + te.getMessage()); - } catch (SAXException sxe) { // Error generated by this application // (or a parser-initialization error) logger.severe("SAX error " + sxe.getMessage()); - - } catch (ParserConfigurationException pce) { - // Parser with specified options can't be built - logger.severe("Parser configuration error " + pce.getMessage()); } catch (IOException ioe) { // I/O error logger.warning("I/O error " + ioe.getMessage()); } - } public static void injectSettingsService(SettingsServiceBean settingsSvc) { diff --git a/src/main/java/edu/harvard/iq/dataverse/harvest/client/FastGetRecord.java b/src/main/java/edu/harvard/iq/dataverse/harvest/client/FastGetRecord.java index 4f4fd39eb98..594e5f22170 100644 --- a/src/main/java/edu/harvard/iq/dataverse/harvest/client/FastGetRecord.java +++ b/src/main/java/edu/harvard/iq/dataverse/harvest/client/FastGetRecord.java @@ -20,6 +20,8 @@ package edu.harvard.iq.dataverse.harvest.client; import edu.harvard.iq.dataverse.harvest.client.oai.OaiHandler; +import edu.harvard.iq.dataverse.util.xml.XmlUtil; + import java.io.IOException; import java.io.InputStream; @@ -126,7 +128,7 @@ public boolean isDeleted () { public void harvestRecord(String baseURL, String identifier, String metadataPrefix, Map customHeaders, HttpClient httpClient) throws IOException, ParserConfigurationException, SAXException, TransformerException{ - xmlInputFactory = javax.xml.stream.XMLInputFactory.newInstance(); + xmlInputFactory = XmlUtil.getSecureXMLInputFactory(); String requestURL = getRequestURL(baseURL, identifier, metadataPrefix); InputStream in; diff --git a/src/main/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/xlsx/XLSXFileReader.java b/src/main/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/xlsx/XLSXFileReader.java index ef91793690e..43b2210efa4 100644 --- a/src/main/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/xlsx/XLSXFileReader.java +++ b/src/main/java/edu/harvard/iq/dataverse/ingest/tabulardata/impl/plugins/xlsx/XLSXFileReader.java @@ -23,6 +23,9 @@ import java.io.*; import java.io.FileReader; import java.util.logging.*; + +import javax.xml.parsers.ParserConfigurationException; + import java.util.*; import edu.harvard.iq.dataverse.DataTable; @@ -33,6 +36,8 @@ import edu.harvard.iq.dataverse.ingest.tabulardata.TabularDataIngest; import edu.harvard.iq.dataverse.util.BundleUtil; +import edu.harvard.iq.dataverse.util.xml.XmlUtil; + import org.apache.commons.lang3.StringUtils; import org.apache.poi.xssf.eventusermodel.XSSFReader; @@ -246,7 +251,7 @@ public void processSheet(InputStream inputStream, DataTable dataTable, PrintWrit sheet1.close(); } - public XMLReader fetchSheetParser(SharedStrings sst, DataTable dataTable, PrintWriter tempOut) throws SAXException { + public XMLReader fetchSheetParser(SharedStrings sst, DataTable dataTable, PrintWriter tempOut) throws ParserConfigurationException, SAXException { // An attempt to use org.apache.xerces.parsers.SAXParser resulted // in some weird conflict in the app; the default XMLReader obtained // from the XMLReaderFactory (from xml-apis.jar) appears to be working @@ -260,7 +265,7 @@ public XMLReader fetchSheetParser(SharedStrings sst, DataTable dataTable, PrintW // unnecessary. // -- L.A. 4.0 alpha 1 - XMLReader xReader = XMLReaderFactory.createXMLReader(); + XMLReader xReader = XmlUtil.getSecureXMLReader(); dbglog.fine("creating new SheetHandler;"); ContentHandler handler = new SheetHandler(sst, dataTable, tempOut); xReader.setContentHandler(handler); diff --git a/src/main/java/edu/harvard/iq/dataverse/metadataimport/ForeignMetadataImportServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/metadataimport/ForeignMetadataImportServiceBean.java index 33f8277919a..1cd68016996 100644 --- a/src/main/java/edu/harvard/iq/dataverse/metadataimport/ForeignMetadataImportServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/metadataimport/ForeignMetadataImportServiceBean.java @@ -10,6 +10,8 @@ import edu.harvard.iq.dataverse.DatasetFieldValue; import edu.harvard.iq.dataverse.ForeignMetadataFieldMapping; import edu.harvard.iq.dataverse.ForeignMetadataFormatMapping; +import edu.harvard.iq.dataverse.util.xml.XmlUtil; + import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; @@ -68,7 +70,7 @@ public void importXML(String xmlToParse, String foreignFormat, DatasetVersion da try { reader = new StringReader(xmlToParse); - XMLInputFactory xmlFactory = javax.xml.stream.XMLInputFactory.newInstance(); + XMLInputFactory xmlFactory = XmlUtil.getSecureXMLInputFactory(); xmlr = xmlFactory.createXMLStreamReader(reader); processXML(xmlr, mappingSupported, datasetVersion); @@ -95,7 +97,7 @@ public void importXML(File xmlFile, String foreignFormat, DatasetVersion dataset try { in = new FileInputStream(xmlFile); - XMLInputFactory xmlFactory = javax.xml.stream.XMLInputFactory.newInstance(); + XMLInputFactory xmlFactory = XmlUtil.getSecureXMLInputFactory(); xmlr = xmlFactory.createXMLStreamReader(in); processXML(xmlr, mappingSupported, datasetVersion); } catch (FileNotFoundException ex) { diff --git a/src/main/java/edu/harvard/iq/dataverse/util/FileUtil.java b/src/main/java/edu/harvard/iq/dataverse/util/FileUtil.java index 451529a9bec..38707f5fc7c 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/FileUtil.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/FileUtil.java @@ -42,6 +42,7 @@ import edu.harvard.iq.dataverse.util.file.BagItFileHandler; import edu.harvard.iq.dataverse.util.file.CreateDataFileResult; import edu.harvard.iq.dataverse.util.file.BagItFileHandlerFactory; +import edu.harvard.iq.dataverse.util.xml.XmlUtil; import edu.harvard.iq.dataverse.util.xml.html.HtmlFormatUtil; import static edu.harvard.iq.dataverse.util.xml.html.HtmlFormatUtil.formatDoc; import static edu.harvard.iq.dataverse.util.xml.html.HtmlFormatUtil.HTML_H1; @@ -90,6 +91,8 @@ import jakarta.enterprise.inject.spi.CDI; import jakarta.json.JsonArray; import jakarta.json.JsonObject; + +import javax.xml.stream.XMLInputFactory; import javax.xml.stream.XMLStreamConstants; import javax.xml.stream.XMLStreamException; import javax.xml.stream.XMLStreamReader; @@ -683,13 +686,11 @@ private static boolean isFITSFile(InputStream ins) { private static boolean isGraphMLFile(File file) { boolean isGraphML = false; logger.fine("begin isGraphMLFile()"); - FileReader fileReader = null; - try{ - fileReader = new FileReader(file); - javax.xml.stream.XMLInputFactory xmlif = javax.xml.stream.XMLInputFactory.newInstance(); - xmlif.setProperty("javax.xml.stream.isCoalescing", java.lang.Boolean.TRUE); + XMLStreamReader xmlr = null; + try(FileReader fileReader = new FileReader(file)) { + XMLInputFactory xmlif = XmlUtil.getSecureXMLInputFactory(); - XMLStreamReader xmlr = xmlif.createXMLStreamReader(fileReader); + xmlr = xmlif.createXMLStreamReader(fileReader); for (int event = xmlr.next(); event != XMLStreamConstants.END_DOCUMENT; event = xmlr.next()) { if (event == XMLStreamConstants.START_ELEMENT) { if (xmlr.getLocalName().equals("graphml")) { @@ -709,11 +710,11 @@ private static boolean isGraphMLFile(File file) { } catch(IOException e) { throw new EJBException(e); } finally { - if (fileReader != null) { + if (xmlr != null) { try { - fileReader.close(); - } catch (IOException ioex) { - logger.warning("IOException closing file reader in GraphML type checker"); + xmlr.close(); + } catch (XMLStreamException e) { + logger.warning("XMLStreamException closing XMLStreamReader in GraphML type checker"); } } } diff --git a/src/main/java/edu/harvard/iq/dataverse/util/xml/XmlPrinter.java b/src/main/java/edu/harvard/iq/dataverse/util/xml/XmlPrinter.java index e568fad56d5..f253be1437a 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/xml/XmlPrinter.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/xml/XmlPrinter.java @@ -3,6 +3,8 @@ import java.io.StringReader; import java.io.StringWriter; import java.util.logging.Logger; + +import javax.xml.XMLConstants; import javax.xml.transform.OutputKeys; import javax.xml.transform.Transformer; import javax.xml.transform.TransformerException; @@ -16,9 +18,13 @@ public class XmlPrinter { static public String prettyPrintXml(String xml) { try { - Transformer transformer = TransformerFactory.newInstance().newTransformer(); + TransformerFactory tf = TransformerFactory.newInstance(); + // Set secure processing feature + tf.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true); - // pretty print by indention + tf.setAttribute(XMLConstants.ACCESS_EXTERNAL_STYLESHEET, ""); + Transformer transformer = tf.newTransformer(); + // pretty print with indentation transformer.setOutputProperty(OutputKeys.INDENT, "yes"); // set amount of whitespace during indent transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "2"); diff --git a/src/main/java/edu/harvard/iq/dataverse/util/xml/XmlUtil.java b/src/main/java/edu/harvard/iq/dataverse/util/xml/XmlUtil.java new file mode 100644 index 00000000000..1dcc4c95499 --- /dev/null +++ b/src/main/java/edu/harvard/iq/dataverse/util/xml/XmlUtil.java @@ -0,0 +1,122 @@ +package edu.harvard.iq.dataverse.util.xml; + +import java.util.logging.Level; +import java.util.logging.Logger; + +import javax.xml.parsers.DocumentBuilder; +import javax.xml.parsers.DocumentBuilderFactory; +import javax.xml.parsers.ParserConfigurationException; +import javax.xml.parsers.SAXParser; +import javax.xml.parsers.SAXParserFactory; +import javax.xml.stream.XMLInputFactory; + +import org.xml.sax.SAXException; +import org.xml.sax.XMLReader; + +/** + * Utility class for XML processing with security settings to prevent XXE attacks. + */ +public class XmlUtil { + + private static final Logger logger = Logger.getLogger(XmlUtil.class.getCanonicalName()); + + /** + * Creates and returns a DocumentBuilderFactory with security settings to prevent XXE attacks. + * + * @return A secure DocumentBuilderFactory instance + */ + public static DocumentBuilderFactory getSecureDocumentBuilderFactory() { + DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); + + try { + // Disable DTDs (doctypes) + factory.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true); + + // Disable external entities + factory.setFeature("http://xml.org/sax/features/external-general-entities", false); + factory.setFeature("http://xml.org/sax/features/external-parameter-entities", false); + + // Disable entity expansion + factory.setExpandEntityReferences(false); + factory.setXIncludeAware(false); + + // Additional security settings + factory.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false); + + } catch (ParserConfigurationException pce) { + // Parser with specified options can't be built + logger.log(Level.SEVERE, "Failed to configure secure DocumentBuilderFactory: {0}", pce.getMessage()); + } + + return factory; + } + + /** + * Creates and returns a DocumentBuilder with security settings to prevent XXE attacks. + * + * @return A secure DocumentBuilder instance or null if configuration fails + */ + public static DocumentBuilder getSecureDocumentBuilder() { + DocumentBuilderFactory factory = getSecureDocumentBuilderFactory(); + try { + return factory.newDocumentBuilder(); + } catch (ParserConfigurationException pce) { + // Parser with specified options can't be built + logger.log(Level.SEVERE, "Failed to create secure DocumentBuilder: {0}", pce.getMessage()); + return null; + } + } + + /** + * Creates and returns a secure XMLReader with protection against XXE attacks. + * + * @return A secure XMLReader instance + * @throws SAXException If there's an error creating the XMLReader + * @throws ParserConfigurationException If there's an error configuring the parser + */ + public static XMLReader getSecureXMLReader() throws SAXException, ParserConfigurationException { + SAXParserFactory spf = SAXParserFactory.newInstance(); + + // Configure the parser factory with security features + spf.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true); + spf.setFeature("http://xml.org/sax/features/external-general-entities", false); + spf.setFeature("http://xml.org/sax/features/external-parameter-entities", false); + spf.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false); + + // Create a secure parser + SAXParser parser = spf.newSAXParser(); + + // Get the XMLReader from the parser + XMLReader reader = parser.getXMLReader(); + + return reader; + } + + /** + * Creates and returns a secure XMLInputFactory with protection against XXE attacks. + * + * @return A secure XMLInputFactory instance + */ + public static XMLInputFactory getSecureXMLInputFactory() { + XMLInputFactory xmlInputFactory = XMLInputFactory.newInstance(); + + // Set coalescing to merge CDATA sections with adjacent text nodes + xmlInputFactory.setProperty(XMLInputFactory.IS_COALESCING, Boolean.TRUE); + + // Disable DTDs and external entities + xmlInputFactory.setProperty(XMLInputFactory.SUPPORT_DTD, Boolean.FALSE); + xmlInputFactory.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, Boolean.FALSE); + + // Disable entity replacement + try { + xmlInputFactory.setProperty("javax.xml.stream.isSupportingExternalEntities", Boolean.FALSE); + xmlInputFactory.setProperty("javax.xml.stream.supportDTD", Boolean.FALSE); + } catch (IllegalArgumentException e) { + // Some implementations might not support these exact property names + logger.log(Level.FINE, "XMLInputFactory doesn't support some security properties, continuing with defaults", e); + } + + return xmlInputFactory; + } + +} \ No newline at end of file diff --git a/src/main/java/edu/harvard/iq/dataverse/util/xml/XmlValidator.java b/src/main/java/edu/harvard/iq/dataverse/util/xml/XmlValidator.java index cec64ab95b7..b599c82ce8b 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/xml/XmlValidator.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/xml/XmlValidator.java @@ -31,8 +31,20 @@ public static boolean validateXmlSchema(String fileToValidate, URL schemaToValid public static boolean validateXmlSchema(Source xmlFile, URL schemaToValidateAgainst) throws MalformedURLException, SAXException, IOException { SchemaFactory schemaFactory = SchemaFactory.newInstance(XMLConstants.W3C_XML_SCHEMA_NS_URI); + + try { + schemaFactory.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true); + // Additional protection + schemaFactory.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true); + schemaFactory.setFeature("http://xml.org/sax/features/external-general-entities", false); + schemaFactory.setFeature("http://xml.org/sax/features/external-parameter-entities", false); + } catch (SAXException e) { + logger.warning("Could not set XML security features: " + e.getMessage()); + } + Schema schema = schemaFactory.newSchema(schemaToValidateAgainst); Validator validator = schema.newValidator(); + try { validator.validate(xmlFile); logger.info(xmlFile.getSystemId() + " is valid"); @@ -52,7 +64,7 @@ public static boolean validateXmlSchema(Source xmlFile, URL schemaToValidateAgai * @throws Exception if the XML is not well-formed with a message about why. */ public static boolean validateXmlWellFormed(String filename) throws Exception { - DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); + DocumentBuilderFactory factory = XmlUtil.getSecureDocumentBuilderFactory(); factory.setValidating(false); factory.setNamespaceAware(true); DocumentBuilder builder = factory.newDocumentBuilder(); diff --git a/src/test/java/edu/harvard/iq/dataverse/api/DatasetsIT.java b/src/test/java/edu/harvard/iq/dataverse/api/DatasetsIT.java index 99e308c4bcd..2decd7b19d7 100644 --- a/src/test/java/edu/harvard/iq/dataverse/api/DatasetsIT.java +++ b/src/test/java/edu/harvard/iq/dataverse/api/DatasetsIT.java @@ -13,6 +13,7 @@ import edu.harvard.iq.dataverse.util.BundleUtil; import edu.harvard.iq.dataverse.util.SystemConfig; import edu.harvard.iq.dataverse.util.json.*; +import edu.harvard.iq.dataverse.util.xml.XmlUtil; import io.restassured.RestAssured; import io.restassured.http.ContentType; import io.restassured.parsing.Parser; @@ -4088,7 +4089,7 @@ public void testCuratePublishedDatasetVersionCommand() throws IOException { Map mapVarToVarMet = new HashMap(); Map varGroupMap = new HashMap(); try { - XMLInputFactory factory = XMLInputFactory.newInstance(); + XMLInputFactory factory = XmlUtil.getSecureXMLInputFactory(); XMLStreamReader xmlr = factory.createXMLStreamReader(variableData); VariableMetadataDDIParser vmdp = new VariableMetadataDDIParser(); diff --git a/src/test/java/edu/harvard/iq/dataverse/api/EditDDIIT.java b/src/test/java/edu/harvard/iq/dataverse/api/EditDDIIT.java index 2fa208ffa5c..6f5ee03aa92 100644 --- a/src/test/java/edu/harvard/iq/dataverse/api/EditDDIIT.java +++ b/src/test/java/edu/harvard/iq/dataverse/api/EditDDIIT.java @@ -8,6 +8,7 @@ import edu.harvard.iq.dataverse.datavariable.VarGroup; import edu.harvard.iq.dataverse.datavariable.VariableMetadata; import edu.harvard.iq.dataverse.datavariable.VariableMetadataDDIParser; +import edu.harvard.iq.dataverse.util.xml.XmlUtil; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Test; @@ -93,7 +94,7 @@ public void testUpdateVariableMetadata() throws InterruptedException { Map mapVarToVarMet = new HashMap(); Map varGroupMap = new HashMap(); try { - XMLInputFactory factory = XMLInputFactory.newInstance(); + XMLInputFactory factory = XmlUtil.getSecureXMLInputFactory(); XMLStreamReader xmlr = factory.createXMLStreamReader(variableData); VariableMetadataDDIParser vmdp = new VariableMetadataDDIParser(); diff --git a/src/test/java/edu/harvard/iq/dataverse/datavariable/VariableMetadataDDIParserTest.java b/src/test/java/edu/harvard/iq/dataverse/datavariable/VariableMetadataDDIParserTest.java index 475b4c1cff5..0d054f36c6a 100644 --- a/src/test/java/edu/harvard/iq/dataverse/datavariable/VariableMetadataDDIParserTest.java +++ b/src/test/java/edu/harvard/iq/dataverse/datavariable/VariableMetadataDDIParserTest.java @@ -2,6 +2,8 @@ import org.junit.jupiter.api.Test; +import edu.harvard.iq.dataverse.util.xml.XmlUtil; + import javax.xml.stream.XMLInputFactory; import javax.xml.stream.XMLStreamException; import javax.xml.stream.XMLStreamReader; @@ -25,7 +27,7 @@ public void testDDIReader() { String fileName = "src/test/resources/xml/dct.xml"; XMLStreamReader xmlr = null; - XMLInputFactory factory=XMLInputFactory.newInstance(); + XMLInputFactory factory = XmlUtil.getSecureXMLInputFactory(); try { xmlr = factory.createXMLStreamReader(new FileInputStream(fileName)); } catch (Exception e) { diff --git a/src/test/java/edu/harvard/iq/dataverse/export/OpenAireExporterTest.java b/src/test/java/edu/harvard/iq/dataverse/export/OpenAireExporterTest.java index 2d06436fb33..efc18f5a1df 100644 --- a/src/test/java/edu/harvard/iq/dataverse/export/OpenAireExporterTest.java +++ b/src/test/java/edu/harvard/iq/dataverse/export/OpenAireExporterTest.java @@ -2,6 +2,7 @@ import io.restassured.path.xml.XmlPath; import edu.harvard.iq.dataverse.util.xml.XmlPrinter; +import edu.harvard.iq.dataverse.util.xml.XmlUtil; import io.gdcc.spi.export.ExportDataProvider; import io.gdcc.spi.export.XMLExporter; @@ -107,7 +108,7 @@ public void testValidateExportDataset() throws Exception { System.out.println("XML: " + xmlAsString); } InputStream xmlStream = new ByteArrayInputStream(byteArrayOutputStream.toByteArray()); - DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); + DocumentBuilderFactory factory = XmlUtil.getSecureDocumentBuilderFactory(); //factory.setValidating(true); factory.setNamespaceAware(true); factory.setAttribute("http://java.sun.com/xml/jaxp/properties/schemaLanguage", diff --git a/src/test/java/edu/harvard/iq/dataverse/util/xml/XmlUtilTest.java b/src/test/java/edu/harvard/iq/dataverse/util/xml/XmlUtilTest.java new file mode 100644 index 00000000000..883c5f78132 --- /dev/null +++ b/src/test/java/edu/harvard/iq/dataverse/util/xml/XmlUtilTest.java @@ -0,0 +1,564 @@ +package edu.harvard.iq.dataverse.util.xml; + +import static org.hamcrest.MatcherAssert.assertThat; +import static org.junit.jupiter.api.Assertions.assertDoesNotThrow; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.io.ByteArrayInputStream; +import java.io.InputStream; +import java.io.StringReader; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.List; + +import javax.xml.parsers.DocumentBuilder; +import javax.xml.parsers.DocumentBuilderFactory; +import javax.xml.stream.XMLInputFactory; +import javax.xml.stream.XMLStreamConstants; +import javax.xml.stream.XMLStreamException; +import javax.xml.stream.XMLStreamReader; +import javax.xml.xpath.XPath; +import javax.xml.xpath.XPathFactory; + +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.ValueSource; +import org.xml.sax.Attributes; +import org.xml.sax.InputSource; +import org.xml.sax.XMLReader; +import org.xml.sax.helpers.DefaultHandler; + +public class XmlUtilTest { + String xml = """ + + + Hello + + """; + + @Test + void testRetrieveCurrentAsString() throws Exception { + // Create a secure XMLReader using XmlUtil + XMLReader xmlReader = XmlUtil.getSecureXMLReader(); + + // Create a handler to capture the parsed content + final StringBuilder parsedContent = new StringBuilder(); + final boolean[] foundElement = new boolean[1]; + + xmlReader.setContentHandler(new DefaultHandler() { + private boolean insideTwo = false; + + @Override + public void startElement(String uri, String localName, String qName, Attributes attributes) { + if ("two".equals(localName) || "two".equals(qName)) { + insideTwo = true; + foundElement[0] = true; + } + } + + @Override + public void characters(char[] ch, int start, int length) { + if (insideTwo) { + parsedContent.append(ch, start, length); + } + } + + @Override + public void endElement(String uri, String localName, String qName) { + if ("two".equals(localName) || "two".equals(qName)) { + insideTwo = false; + } + } + }); + + // Reset the input stream since we're reusing it + InputStream freshInputStream = new ByteArrayInputStream(xml.getBytes()); + + // Parse the XML + xmlReader.parse(new InputSource(freshInputStream)); + + // Verify the content was parsed correctly + assertTrue(foundElement[0], "The 'two' element should be found"); + assertEquals("Hello", parsedContent.toString().trim(), "The content of the 'two' element should be 'Hello'"); + + // If you want to use XPath to verify the structure, you can use DocumentBuilder instead + DocumentBuilder db = XmlUtil.getSecureDocumentBuilder(); + org.w3c.dom.Document doc = db.parse(new InputSource(new ByteArrayInputStream(xml.getBytes()))); + + // Use XPath to verify the structure + XPathFactory xpf = XPathFactory.newInstance(); + XPath xpath = xpf.newXPath(); + String result = xpath.evaluate("/test/one/two", doc); + + assertEquals("Hello", result, "XPath should find the content of the 'two' element"); + } + + @Test + void testSecurityConfiguration_EntityReferencesHandledSecurely() throws Exception { + // Test that XML with entity references is handled securely + + // First, test with predefined entities which should work fine + String xmlWithPredefinedEntities = "" + + "" + + " This text has <brackets> and an &ampersand;" + + ""; + + InputStream xmlStream = new ByteArrayInputStream(xmlWithPredefinedEntities.getBytes(StandardCharsets.UTF_8)); + + // Create a secure XMLReader using XmlUtil + XMLReader xmlReader = XmlUtil.getSecureXMLReader(); + + // Create a handler to capture the parsed content + final StringBuilder predefinedContent = new StringBuilder(); + final boolean[] inPredefined = new boolean[1]; + + xmlReader.setContentHandler(new DefaultHandler() { + @Override + public void startElement(String uri, String localName, String qName, Attributes attributes) { + String name = localName.isEmpty() ? qName : localName; + if ("predefined".equals(name)) { + inPredefined[0] = true; + } + } + + @Override + public void characters(char[] ch, int start, int length) { + String content = new String(ch, start, length); + if (inPredefined[0]) { + predefinedContent.append(content); + } + } + + @Override + public void endElement(String uri, String localName, String qName) { + String name = localName.isEmpty() ? qName : localName; + if ("predefined".equals(name)) { + inPredefined[0] = false; + } + } + }); + + // Parse the XML with predefined entities - should not throw an exception + assertDoesNotThrow(() -> xmlReader.parse(new InputSource(xmlStream))); + + // Verify predefined entities were properly decoded + assertEquals("This text has and an &ersand;", + predefinedContent.toString().trim(), + "Predefined XML entities should be properly decoded"); + + // Now test with custom entity which should cause an exception + String xmlWithCustomEntity = "" + + "" + + " &customEntity;" + + ""; + + InputStream dbStream = new ByteArrayInputStream(xmlWithCustomEntity.getBytes(StandardCharsets.UTF_8)); + + // The parser should throw an exception for undeclared entities + // This is the expected secure behavior + Exception exception = assertThrows(org.xml.sax.SAXParseException.class, + () -> xmlReader.parse(new InputSource(dbStream))); + + // Verify the exception message mentions the undeclared entity + assertTrue(exception.getMessage().contains("customEntity"), + "Exception should mention the undeclared entity"); + + // Also test with DocumentBuilder for completeness + DocumentBuilder db = XmlUtil.getSecureDocumentBuilder(); + + // Test with predefined entities + org.w3c.dom.Document doc = db.parse(new InputSource(new ByteArrayInputStream( + xmlWithPredefinedEntities.getBytes(StandardCharsets.UTF_8)))); + + // Check that the elements exist with properly handled entities + org.w3c.dom.NodeList predefinedNodes = doc.getElementsByTagName("predefined"); + assertEquals(1, predefinedNodes.getLength(), "Predefined element should exist"); + + String predefinedText = predefinedNodes.item(0).getTextContent(); + assertEquals("This text has and an &ersand;", + predefinedText, + "Predefined XML entities should be properly decoded"); + + // Test with custom entity - should throw exception + Exception dbException = assertThrows(org.xml.sax.SAXParseException.class, + () -> db.parse(new InputSource(new ByteArrayInputStream( + xmlWithCustomEntity.getBytes(StandardCharsets.UTF_8))))); + + assertTrue(dbException.getMessage().contains("customEntity"), + "Exception should mention the undeclared entity"); + } + + @Test + void testSecurityConfiguration_DTDProcessingDisabled() throws Exception { + // Test that DTD features are properly disabled + // Instead of using DOCTYPE declarations (which might be rejected entirely), + // we'll verify the security features are properly configured + + // Create a secure XMLReader using XmlUtil + XMLReader xmlReader = XmlUtil.getSecureXMLReader(); + + // Verify that DTD-related features are properly disabled + assertTrue(xmlReader.getFeature("http://apache.org/xml/features/disallow-doctype-decl"), + "DOCTYPE declarations should be disallowed"); + assertFalse(xmlReader.getFeature("http://xml.org/sax/features/external-general-entities"), + "External general entities should be disabled"); + assertFalse(xmlReader.getFeature("http://xml.org/sax/features/external-parameter-entities"), + "External parameter entities should be disabled"); + + // Test with a simple XML without DOCTYPE to ensure basic parsing still works + String simpleXml = "Simple content"; + InputStream stream = new ByteArrayInputStream(simpleXml.getBytes(StandardCharsets.UTF_8)); + + // Create a handler to capture the parsed content + final StringBuilder parsedContent = new StringBuilder(); + final boolean[] rootElementFound = new boolean[1]; + + xmlReader.setContentHandler(new DefaultHandler() { + private boolean insideRoot = false; + + @Override + public void startElement(String uri, String localName, String qName, Attributes attributes) { + if ("root".equals(localName) || "root".equals(qName)) { + insideRoot = true; + rootElementFound[0] = true; + } + } + + @Override + public void characters(char[] ch, int start, int length) { + if (insideRoot) { + parsedContent.append(ch, start, length); + } + } + + @Override + public void endElement(String uri, String localName, String qName) { + if ("root".equals(localName) || "root".equals(qName)) { + insideRoot = false; + } + } + }); + + // Parse the XML - should not throw an exception + assertDoesNotThrow(() -> xmlReader.parse(new InputSource(stream))); + + // Verify the root element was found and content was parsed correctly + assertTrue(rootElementFound[0], "Root element should be found"); + assertEquals("Simple content", parsedContent.toString().trim(), + "Content should be parsed correctly"); + + // Also test with DocumentBuilder for completeness + DocumentBuilder db = XmlUtil.getSecureDocumentBuilder(); + + // Verify that DTD-related features are properly disabled in DocumentBuilderFactory + DocumentBuilderFactory dbf = XmlUtil.getSecureDocumentBuilderFactory(); + + assertTrue(dbf.getFeature("http://apache.org/xml/features/disallow-doctype-decl"), + "DOCTYPE declarations should be disallowed in DocumentBuilderFactory"); + assertFalse(dbf.getFeature("http://xml.org/sax/features/external-general-entities"), + "External general entities should be disabled in DocumentBuilderFactory"); + assertFalse(dbf.getFeature("http://xml.org/sax/features/external-parameter-entities"), + "External parameter entities should be disabled in DocumentBuilderFactory"); + + // Parse a simple XML to ensure basic functionality works + org.w3c.dom.Document doc = db.parse(new InputSource(new ByteArrayInputStream( + simpleXml.getBytes(StandardCharsets.UTF_8)))); + + // Check that the root element exists with correct content + org.w3c.dom.NodeList rootNodes = doc.getElementsByTagName("root"); + assertEquals(1, rootNodes.getLength(), "Root element should exist"); + assertEquals("Simple content", rootNodes.item(0).getTextContent(), + "Root element should have correct content"); + } + + @Test + void testSecurityConfiguration_EntityReferencesNotReplaced() throws Exception { + // Test that predefined entity references are handled safely + // Using predefined XML entities that don't require DOCTYPE declarations + String xmlWithPredefinedEntities = "" + + "This text has <brackets> and an &ampersand;"; + + InputStream stream = new ByteArrayInputStream(xmlWithPredefinedEntities.getBytes(StandardCharsets.UTF_8)); + + // Create a secure XMLReader using XmlUtil + XMLReader xmlReader = XmlUtil.getSecureXMLReader(); + + // Create a handler to capture the parsed content + final StringBuilder parsedContent = new StringBuilder(); + final boolean[] rootElementFound = new boolean[1]; + + xmlReader.setContentHandler(new DefaultHandler() { + private boolean insideRoot = false; + + @Override + public void startElement(String uri, String localName, String qName, Attributes attributes) { + if ("root".equals(localName) || "root".equals(qName)) { + insideRoot = true; + rootElementFound[0] = true; + } + } + + @Override + public void characters(char[] ch, int start, int length) { + if (insideRoot) { + parsedContent.append(ch, start, length); + } + } + + @Override + public void endElement(String uri, String localName, String qName) { + if ("root".equals(localName) || "root".equals(qName)) { + insideRoot = false; + } + } + }); + + // Parse the XML - should not throw an exception + assertDoesNotThrow(() -> xmlReader.parse(new InputSource(stream))); + + // Verify the root element was found + assertTrue(rootElementFound[0], "Root element should be found"); + + // Verify the predefined entities were properly handled + // The parser should convert < to < and & to & as these are built-in XML entities + assertEquals("This text has and an &ersand;", parsedContent.toString().trim(), + "Predefined XML entities should be properly decoded"); + + // Also test with DocumentBuilder for completeness + assertDoesNotThrow(() -> { + DocumentBuilder db = XmlUtil.getSecureDocumentBuilder(); + org.w3c.dom.Document doc = db.parse(new InputSource(new ByteArrayInputStream( + xmlWithPredefinedEntities.getBytes(StandardCharsets.UTF_8)))); + + // Check that the root element exists with properly decoded entities + org.w3c.dom.NodeList rootNodes = doc.getElementsByTagName("root"); + assertEquals(1, rootNodes.getLength(), "Root element should exist"); + + String rootContent = rootNodes.item(0).getTextContent(); + assertEquals("This text has and an &ersand;", rootContent, + "Predefined XML entities should be properly decoded"); + }); + } + + @ParameterizedTest + @ValueSource(strings = { + // XML Bomb (Billion laughs attack) + "" + + "]>&lol3;", + // External entity attack + "" + + "]>" + + "&xxe;", + // Parameter entity attack + "%xxe;]>test" + }) + void testSecurityConfiguration_MaliciousXMLHandledSafely(String maliciousXml) throws Exception { + // Create a secure XMLReader using XmlUtil + XMLReader xmlReader = XmlUtil.getSecureXMLReader(); + + // Since DOCTYPE declarations are disallowed, parsing should throw an exception + InputStream xmlStream = new ByteArrayInputStream(maliciousXml.getBytes(StandardCharsets.UTF_8)); + Exception exception = assertThrows(org.xml.sax.SAXParseException.class, + () -> xmlReader.parse(new InputSource(xmlStream))); + + // Verify the exception message mentions DOCTYPE + assertTrue(exception.getMessage().contains("DOCTYPE") || + exception.getMessage().contains("doctype"), + "Exception should mention DOCTYPE being disallowed"); + + // Also test with DocumentBuilder for completeness + DocumentBuilder db = XmlUtil.getSecureDocumentBuilder(); + + // Create a new stream for the DocumentBuilder test + InputStream dbStream = new ByteArrayInputStream(maliciousXml.getBytes(StandardCharsets.UTF_8)); + + // DocumentBuilder should also throw an exception + Exception dbException = assertThrows(org.xml.sax.SAXParseException.class, + () -> db.parse(new InputSource(dbStream))); + + // Verify the exception message mentions DOCTYPE + assertTrue(dbException.getMessage().contains("DOCTYPE") || + dbException.getMessage().contains("doctype"), + "Exception should mention DOCTYPE being disallowed"); + } + + @Test + void testSecurityConfiguration_NormalXMLStillWorks() throws Exception { + // Test that normal XML processing still works correctly + String normalXml = "Test" + + " BookTest Author"; + + InputStream stream = new ByteArrayInputStream(normalXml.getBytes(StandardCharsets.UTF_8)); + + // Create a secure XMLReader using XmlUtil + XMLReader xmlReader = XmlUtil.getSecureXMLReader(); + + // Create a handler to capture the parsed content and structure + final List elementNames = new ArrayList<>(); + final String[] bookIdValue = new String[1]; + final StringBuilder titleContent = new StringBuilder(); + final StringBuilder authorContent = new StringBuilder(); + final boolean[] insideTitle = new boolean[1]; + final boolean[] insideAuthor = new boolean[1]; + + xmlReader.setContentHandler(new DefaultHandler() { + @Override + public void startElement(String uri, String localName, String qName, Attributes attributes) { + String elementName = localName.isEmpty() ? qName : localName; + elementNames.add(elementName); + + if ("book".equals(elementName)) { + bookIdValue[0] = attributes.getValue("id"); + } else if ("title".equals(elementName)) { + insideTitle[0] = true; + } else if ("author".equals(elementName)) { + insideAuthor[0] = true; + } + } + + @Override + public void characters(char[] ch, int start, int length) { + String content = new String(ch, start, length); + if (insideTitle[0]) { + titleContent.append(content); + } else if (insideAuthor[0]) { + authorContent.append(content); + } + } + + @Override + public void endElement(String uri, String localName, String qName) { + String elementName = localName.isEmpty() ? qName : localName; + if ("title".equals(elementName)) { + insideTitle[0] = false; + } else if ("author".equals(elementName)) { + insideAuthor[0] = false; + } + } + }); + + // Parse the XML + xmlReader.parse(new InputSource(stream)); + + // Verify the structure was parsed correctly + assertEquals(4, elementNames.size(), "Should have found 4 elements"); + assertEquals("books", elementNames.get(0), "First element should be 'books'"); + assertEquals("book", elementNames.get(1), "Second element should be 'book'"); + assertEquals("title", elementNames.get(2), "Third element should be 'title'"); + assertEquals("author", elementNames.get(3), "Fourth element should be 'author'"); + + // Verify attribute was parsed correctly + assertEquals("1", bookIdValue[0], "Book ID should be '1'"); + + // Verify content was parsed correctly + assertEquals("Test Book", titleContent.toString().trim(), "Title content should be 'Test Book'"); + assertEquals("Test Author", authorContent.toString().trim(), "Author content should be 'Test Author'"); + + // Also test with DocumentBuilder for completeness + DocumentBuilder db = XmlUtil.getSecureDocumentBuilder(); + org.w3c.dom.Document doc = db.parse(new InputSource(new ByteArrayInputStream( + normalXml.getBytes(StandardCharsets.UTF_8)))); + + // Use XPath to verify the structure + XPathFactory xpf = XPathFactory.newInstance(); + XPath xpath = xpf.newXPath(); + + // Verify element structure + assertEquals("books", doc.getDocumentElement().getNodeName(), "Root element should be 'books'"); + assertEquals("1", xpath.evaluate("/books/book/@id", doc), "Book ID should be '1'"); + assertEquals("Test Book", xpath.evaluate("/books/book/title", doc), "Title should be 'Test Book'"); + assertEquals("Test Author", xpath.evaluate("/books/book/author", doc), "Author should be 'Test Author'"); + } + + @Test + void testSecureXMLInputFactory() throws Exception { + // Test that XMLInputFactory is properly secured and works correctly + + // Get the secure XMLInputFactory + XMLInputFactory xmlInputFactory = XmlUtil.getSecureXMLInputFactory(); + + // Verify security settings + assertFalse((Boolean) xmlInputFactory.getProperty(XMLInputFactory.SUPPORT_DTD), + "DTD support should be disabled"); + assertFalse((Boolean) xmlInputFactory.getProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES), + "External entity support should be disabled"); + assertTrue((Boolean) xmlInputFactory.getProperty(XMLInputFactory.IS_COALESCING), + "Coalescing should be enabled"); + + // Test with normal XML + String normalXml = "Test content"; + XMLStreamReader reader = xmlInputFactory.createXMLStreamReader(new StringReader(normalXml)); + + // Parse and verify content + StringBuilder content = new StringBuilder(); + String elementName = null; + + while (reader.hasNext()) { + int event = reader.next(); + + switch (event) { + case XMLStreamConstants.START_ELEMENT: + elementName = reader.getLocalName(); + break; + case XMLStreamConstants.CHARACTERS: + if ("element".equals(elementName)) { + content.append(reader.getText()); + } + break; + } + } + + assertEquals("Test content", content.toString(), "Content should be parsed correctly"); + + // Test with XML containing entity references + String xmlWithEntities = "Test <content>"; + reader = xmlInputFactory.createXMLStreamReader(new StringReader(xmlWithEntities)); + + // Parse and verify content + content = new StringBuilder(); + elementName = null; + + while (reader.hasNext()) { + int event = reader.next(); + + switch (event) { + case XMLStreamConstants.START_ELEMENT: + elementName = reader.getLocalName(); + break; + case XMLStreamConstants.CHARACTERS: + if ("element".equals(elementName)) { + content.append(reader.getText()); + } + break; + } + } + + assertEquals("Test ", content.toString(), "Entities should be properly decoded"); + + // Test with malicious XML (should throw exception) + String maliciousXml = "]>&xxe;"; + + XMLStreamException exception = assertThrows(XMLStreamException.class, () -> { + XMLStreamReader maliciousReader = xmlInputFactory.createXMLStreamReader(new StringReader(maliciousXml)); + // Read through the document + while (maliciousReader.hasNext()) { + maliciousReader.next(); + } + }); + + // The exception message might vary by implementation, but should indicate a problem with DOCTYPE or entities + assertTrue(exception.getMessage().contains("DOCTYPE") || + exception.getMessage().contains("doctype") || + exception.getMessage().contains("entity") || + exception.getMessage().contains("DTD"), + "Exception should indicate a security-related issue"); + } + +}