apache · olabusayoT · Apr 2, 2026 · Apr 14, 2026 · May 16, 2026 · May 18, 2026
diff --git a/.gitattributes b/.gitattributes
@@ -14,4 +14,6 @@
 # limitations under the License.
 
 # Do not include KEYS in archived source releases
-/KEYS export-ignore 
+/KEYS export-ignore
+# Ensure the line endings of this stringAsXml test file are not normalized on Windows
+/daffodil-test/src/test/resources/org/apache/daffodil/infoset/stringAsXml/namespaced/binMessage_01.dat.xml -text
diff --git a/daffodil-core/src/main/resources/org/apache/daffodil/xsd/tdml.xsd b/daffodil-core/src/main/resources/org/apache/daffodil/xsd/tdml.xsd
@@ -224,11 +224,21 @@
   </simpleType>
 
   <simpleType name="validationType">
-    <restriction base="xs:token">
-      <enumeration value="on"/>
-      <enumeration value="limited"/>
-      <enumeration value="off"/>
-    </restriction>
+    <union>
+      <simpleType>
+        <restriction base="xs:token">
+          <enumeration value="on"/>
+          <enumeration value="limited"/>
+          <enumeration value="off"/>
+        </restriction>
+      </simpleType>
+
+      <simpleType>
+        <restriction base="xs:token">
+          <pattern value="[A-Za-z0-9_]+"/>
+        </restriction>
+      </simpleType>
+    </union>
   </simpleType>
 
   <element name="document" type="tns:documentType"/>

diff --git a/daffodil-core/src/main/scala/org/apache/daffodil/lib/xml/DaffodilConstructingLoader.scala b/daffodil-core/src/main/scala/org/apache/daffodil/lib/xml/DaffodilConstructingLoader.scala
@@ -94,13 +94,16 @@ object Position {
  *                          behavior of normalizing CRLF to LF, and solitary CR to LF.
  *                          Defaults to true. Should only be changed in special circumstances
  *                          as not normalizing CRLFs is non-standard for XML.
- *
+ * @param removeComments True to remove comments. Pass false to preserve them, keeping the XML as close to the original as possible
+ * @param removeProcInstr True to remove processing instructions. Pass false to preserve them, keeping the XML as close to the original as possible
  */
 class DaffodilConstructingLoader private[xml] (
   uri: URI,
   errorHandler: org.xml.sax.ErrorHandler,
   addPositionAttributes: Boolean,
-  normalizeCRLFtoLF: Boolean
+  normalizeCRLFtoLF: Boolean,
+  removeComments: Boolean,
+  removeProcInstr: Boolean
 ) extends ConstructingParser(
     {
       // Note: we must open the XML carefully since it might be in some non
@@ -122,7 +125,14 @@ class DaffodilConstructingLoader private[xml] (
     errorHandler: org.xml.sax.ErrorHandler,
     addPositionAttributes: Boolean = false
   ) =
-    this(uri, errorHandler, addPositionAttributes, normalizeCRLFtoLF = true)
+    this(
+      uri,
+      errorHandler,
+      addPositionAttributes,
+      normalizeCRLFtoLF = true,
+      removeComments = true,
+      removeProcInstr = true
+    )
 
   /**
    * Ensures that DOCTYPES aka DTDs, if encountered, are rejected.
@@ -316,19 +326,30 @@ class DaffodilConstructingLoader private[xml] (
   }
 
   /**
-   * Drops comments
+   * Drops comments if removeComments is true
+   *
+   * This is optionally controlled by a constructor parameter.
    */
   override def comment(pos: Int, s: String): Comment = {
-    // returning null drops comments
-    null
+    if (removeComments) {
+      // returning null drops comments
+      null
+    } else {
+      super.comment(pos, s)
+    }
   }
 
   /**
-   * Drops processing instructions
+   * Drops processing instructions if removeProcInstr is true
+   *
+   * This is optionally controlled by a constructor parameter.
    */
   override def procInstr(pos: Int, target: String, txt: String) = {
-    // returning null drops processing instructions
-    null
+    if (removeProcInstr) { // returning null drops processing instructions
+      null
+    } else {
+      super.procInstr(pos, target, txt)
+    }
   }
 
   private def parseXMLPrologAttributes(

diff --git a/daffodil-core/src/main/scala/org/apache/daffodil/lib/xml/DaffodilXMLLoader.scala b/daffodil-core/src/main/scala/org/apache/daffodil/lib/xml/DaffodilXMLLoader.scala
@@ -702,31 +702,20 @@ class DaffodilXMLLoader(val errorHandler: org.xml.sax.ErrorHandler)
    * @param optSchemaURI Optional URI for XML schema for the XML source document.
    * @param addPositionAttributes True to add dafint:file dafint:line attributes to all elements.
    *                              Defaults to false.
-   * @return an scala.xml.Node (Element actually) which is the document element of the source.
-   */
-  def load(
-    source: DaffodilSchemaSource,
-    optSchemaURI: Option[URI],
-    addPositionAttributes: Boolean = false
-  ): scala.xml.Node =
-    load(source, optSchemaURI, addPositionAttributes, normalizeCRLFtoLF = true)
-
-  /**
-   * package private constructor gives access to normalizeCRLFtoLF feature.
-   *
-   * @param source The URI for the XML document which may be a XML or DFDL schema, or just XML data.
-   * @param optSchemaURI Optional URI for XML schema for the XML source document.
-   * @param addPositionAttributes True to add dafint:file dafint:line attributes to all elements.
-   *                              Defaults to false.
    * @param normalizeCRLFtoLF True to normalize CRLF and isolated CR to LF. This should usually be true,
    *                          but some special case situations may require preservation of CRLF/CR.
+   * @param removeComments True to remove comments. Pass false to preserve them, keeping the XML as close to the original as possible
+   * @param removeProcInstr True to remove processing instructions. Pass false to preserve them, keeping the XML as close to the original as possible
+   *
    * @return an scala.xml.Node (Element actually) which is the document element of the source.
    */
-  private[xml] def load(
+  def load(
     source: DaffodilSchemaSource,
     optSchemaURI: Option[URI],
-    addPositionAttributes: Boolean,
-    normalizeCRLFtoLF: Boolean
+    addPositionAttributes: Boolean = false,
+    normalizeCRLFtoLF: Boolean = true,
+    removeComments: Boolean = true,
+    removeProcInstr: Boolean = true
   ): scala.xml.Node = {
     //
     // First we invoke the validator to explicitly validate the XML against
@@ -819,7 +808,9 @@ class DaffodilXMLLoader(val errorHandler: org.xml.sax.ErrorHandler)
         source.uriForLoading,
         errorHandler,
         addPositionAttributes,
-        normalizeCRLFtoLF
+        normalizeCRLFtoLF,
+        removeComments,
+        removeProcInstr
       )
     val res =
       try {

diff --git a/daffodil-core/src/main/scala/org/apache/daffodil/lib/xml/XMLUtils.scala b/daffodil-core/src/main/scala/org/apache/daffodil/lib/xml/XMLUtils.scala
@@ -42,6 +42,8 @@ import org.apache.daffodil.lib.iapi.URISchemaSource
 import org.apache.daffodil.lib.schema.annotation.props.LookupLocation
 import org.apache.daffodil.lib.util.Maybe
 import org.apache.daffodil.lib.util.Misc
+import org.apache.daffodil.runtime1.infoset.InvalidInfosetException
+import org.apache.daffodil.runtime1.infoset.XMLTextInfoset
 
 import org.apache.commons.io.IOUtils
 import org.xml.sax.XMLReader
@@ -599,6 +601,7 @@ object XMLUtils {
 
   def removeComments(e: Node): Node = {
     e match {
+      case x: Elem if isStringAsXmlElem(x) => x
       case Elem(prefix, label, attribs, scope, child*) => {
         val newChildren = child.filterNot { _.isInstanceOf[Comment] }.map { removeComments(_) }
         Elem(prefix, label, attribs, scope, true, newChildren*)
@@ -638,40 +641,108 @@ object XMLUtils {
     res
   }
 
+  /**
+   * True when the node is an unprefixed, attribute-free stringAsXml element
+   * whose innermost namespace binding has a null prefix and a null or empty
+   * URI.
+   */
+  private def isStringAsXmlElem(ns: Node): Boolean = ns match {
+    case Elem(null, XMLTextInfoset.stringAsXml, Null, binding, _*) =>
+      binding match {
+        case NamespaceBinding(null, null | "", _) => true
+        case _ => false
+      }
+    case _ => false
+  }
+
+  /**
+   * Normalizes CRLF and solitary CR to LF within text nodes, leaving
+   * stringAsXml elements (and everything beneath them) untouched.
+   *
+   * Some fields in infosets could contain LFs, but could be changed to CRLF
+   * on Windows due to git's autocrlf feature. And since infoset outputters
+   * always output LF we need to undo what git might do and normalize to LF.
+   */
+  private def normalizeCRLFtoLF(ns: Node): Node = {
+    ns match {
+      // NOTE: this is specifically for the stringAsXml feature as we avoid
+      // making changes to any of its children requiring that stringAsXml in
+      // the infoset match results exactly.
+      case e: Elem if isStringAsXmlElem(e) => e
+      case e: Elem => {
+        val children = e.child
+        val normalized = children.map(normalizeCRLFtoLF)
+        // map typically yields a new Seq even when no node changed, so compare
+        // the nodes pairwise by identity to reuse e when nothing was rewritten
+        if (normalized.corresponds(children)(_ eq _)) e
+        else e.copy(child = normalized)
+      }
+      case Text(data) if data.contains("\r") => {
+        // literal (non-regex) replacement: CRLF first, then any leftover CR
+        val replaced = data.replace("\r\n", "\n").replace("\r", "\n")
+        Text(replaced)
+      }
+      case _ => ns
+    }
+  }
+
   /**
    * removes insignificant whitespace from between elements
    */
 
   private def removeMixedWhitespace(ns: Node): Node = {
-    if (!ns.isInstanceOf[Elem]) return ns
-    val e = ns.asInstanceOf[Elem]
-    val children = e.child
-    val noMixedChildren =
-      if (children.exists(_.isInstanceOf[Elem])) {
-        children
-          .filter {
-            case Text(data) if data.matches("""\s*""") => false
-            case Text(data) =>
-              throw new Exception("Element %s contains mixed data: %s".format(e.label, data))
-            case _ => true
-          }
-          .map(removeMixedWhitespace)
-      } else {
-        children.filter {
-          //
-          // So this is a bit strange, but we're dropping nodes that are Empty String.
-          //
-          // In XML we cannot tell <foo></foo> where there is a Text("") child, from <foo></foo> with Nil children
-          //
-          case Text("") => false // drop empty strings
-          case _ => true
+    ns match {
+      // NOTE: this is specifically for the stringAsXml feature as we avoid
+      // making changes to any of its children except removing any surrounding
+      // whitespace, requiring that stringAsXml in the infoset match results exactly.
+      case e: Elem if isStringAsXmlElem(e) => {
+        val (elemChildren, nonElemChildren) = e.child.partition {
+          _.isInstanceOf[Elem]
+        }
+        if (elemChildren.length != 1)
+          throw new InvalidInfosetException("stringAsXml must contain a single child element.")
+        nonElemChildren.foreach {
+          case Text(data) if data.matches("""\s*""") => // no-op, empty text siblings are fine
+          case x =>
+            throw new Exception(
+              "%s is some kind of mixed content not allowed as a stringAsXml child".format(x)
+            )
         }
+        e.copy(child = elemChildren)
       }
+      case e: Elem => {
+        val children = e.child
+        val noMixedChildren =
+          if (children.exists(_.isInstanceOf[Elem])) {
+            children
+              .filter {
+                case Text(data) if data.matches("""\s*""") => false
+                case Text(data) =>
+                  throw new Exception(
+                    "Element %s contains mixed data: %s".format(e.label, data)
+                  )
+                case _ => true
+              }
+              .map(removeMixedWhitespace)
+          } else {
+            children.filter {
+              //
+              // So this is a bit strange, but we're dropping nodes that are Empty String.
+              //
+              // In XML we cannot tell <foo></foo> where there is a Text("") child, from <foo></foo> with Nil children
+              //
+              case Text("") => false // drop empty strings
+              case _ => true
+            }
+          }
 
-    val res =
-      if (noMixedChildren eq children) e
-      else e.copy(child = noMixedChildren)
-    res
+        val res =
+          if (noMixedChildren eq children) e
+          else e.copy(child = noMixedChildren)
+        res
+      }
+      case _ => ns
+    }
   }
 
   /**
@@ -700,6 +771,15 @@ object XMLUtils {
   ): NodeSeq = {
     val res = n match {
 
+      case e @ Elem(
+            null,
+            XMLTextInfoset.stringAsXml,
+            Null,
+            NamespaceBinding(null, null | "", _),
+            _*
+          ) =>
+        e
+
       case e @ Elem(prefix, label, attributes, scope, children*) => {
 
         val filteredScope = if (ns.length > 0) filterScope(scope, ns) else xml.TopScope
@@ -808,7 +888,8 @@ object XMLUtils {
     val noPCData = convertPCDataToText(noComments)
     val combinedText = coalesceAllAdjacentTextNodes(noPCData)
     val noMixedWS = removeMixedWhitespace(combinedText)
-    noMixedWS
+    val noCRLFs = normalizeCRLFtoLF(noMixedWS)
+    noCRLFs
   }
 
   class XMLDifferenceException(message: String) extends Exception(message)
@@ -973,6 +1054,15 @@ Differences were (path, expected, actual):
         } else if (checkPrefixes && prefixA != prefixB) {
           // different prefix
           List((zPath + "/" + labelA + "@prefix", prefixA, prefixB))
+        } else if (checkPrefixes && a.scope.getURI(prefixA) != b.scope.getURI(prefixB)) {
+          // prefixes don't resolve to the same namespace
+          List(
+            (
+              zPath + "/" + labelA + "@prefix-namespace",
+              a.scope.getURI(prefixA),
+              b.scope.getURI(prefixB)
+            )
+          )
         } else if (checkNamespaces && mappingsA != mappingsB) {
           // different namespace bindings
           List((zPath + "/" + labelA + "@xmlns", mappingsA, mappingsB))
@@ -1055,6 +1145,28 @@ Differences were (path, expected, actual):
           computeTextDiff(zPath, tA, tB, maybeType, maybeFloatEpsilon, maybeDoubleEpsilon)
         thisDiff
       }
+      case (cA: Comment, cB: Comment) => {
+        val thisDiff = computeTextDiff(
+          zPath + "/@comment",
+          cA.toString,
+          cB.toString,
+          None,
+          None,
+          None
+        )
+        thisDiff
+      }
+      case (pcA: PCData, pcB: PCData) => {
+        val thisDiff = computeTextDiff(
+          zPath + "/@PCDATA",
+          pcA.toString,
+          pcB.toString,
+          None,
+          None,
+          None
+        )
+        thisDiff
+      }
       case (pA: ProcInstr, pB: ProcInstr) => {
         val ProcInstr(tA1label, tA1content) = pA
         val ProcInstr(tB1label, tB1content) = pB