Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Normalize XML tag names #2276

Merged
merged 2 commits into from
Feb 25, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,9 @@
* When parsing with the XML parser, XML Declarations and Processing Instructions are directly handled, vs bouncing
through the HTML parser's bogus comment handler. Serialization for non-doctype declarations no longer ends with a
spurious `!`. [2275](https://github.com/jhy/jsoup/pull/2275)
* When converting parsed HTML to XML or the W3C DOM, element names containing `<` are normalized to `_` to ensure valid
XML. For example, `<foo<bar>` becomes `<foo_bar>`, as XML does not allow `<` in element names, but HTML5
does. [2276](https://github.com/jhy/jsoup/pull/2276)

### Bug Fixes

Expand Down
9 changes: 3 additions & 6 deletions src/main/java/org/jsoup/helper/W3CDom.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package org.jsoup.helper;

import org.jsoup.internal.Normalizer;
import org.jsoup.internal.StringUtil;
import org.jsoup.nodes.Attribute;
import org.jsoup.nodes.Attributes;
Expand Down Expand Up @@ -377,12 +378,7 @@ public void head(org.jsoup.nodes.Node source, int depth) {

String prefix = updateNamespaces(sourceEl);
String namespace = namespaceAware ? namespacesStack.peek().get(prefix) : null;
String tagName = sourceEl.tagName();

/* Tag names in XML are quite permissive, but less permissive than HTML. Rather than reimplement the validation,
we just try to use it as-is. If it fails, insert as a text node instead. We don't try to normalize the
tagname to something safe, because that isn't going to be meaningful downstream. This seems(?) to be
how browsers handle the situation, also. https://github.com/jhy/jsoup/issues/1093 */
String tagName = Normalizer.xmlSafeTagName(sourceEl.tagName());
try {
// use an empty namespace if none is present but the tag name has a prefix
String imputedNamespace = namespace == null && tagName.contains(":") ? "" : namespace;
Expand All @@ -393,6 +389,7 @@ public void head(org.jsoup.nodes.Node source, int depth) {
doc.setUserData(ContextNodeProperty, el, null);
dest = el; // descend
} catch (DOMException e) {
// If the Normalizer didn't make the tag name XML / W3C safe, insert the tag as plain text instead
append(doc.createTextNode("<" + tagName + ">"), sourceEl);
}
} else if (source instanceof org.jsoup.nodes.TextNode) {
Expand Down
6 changes: 6 additions & 0 deletions src/main/java/org/jsoup/internal/Normalizer.java
Original file line number Diff line number Diff line change
Expand Up @@ -21,4 +21,10 @@ public static String normalize(final String input) {
public static String normalize(final String input, boolean isStringLiteral) {
return isStringLiteral ? lowerCase(input) : normalize(input);
}

/**
 * Rewrites an HTML tag name so it can be used as an XML name: every {@code <} is swapped for {@code _},
 * so {@code foo<bar} comes back as {@code foo_bar}. Names without a {@code <} are returned unchanged.
 */
public static String xmlSafeTagName(final String tagname) {
    // todo - if required we could make a fuller version of this as in Attribute.getValidKey(syntax) in Element. for now, just minimal based on what HtmlTreeBuilder produces
    final char unsafe = '<';
    final char substitute = '_';
    if (tagname.indexOf(unsafe) == -1)
        return tagname; // nothing to rewrite
    return tagname.replace(unsafe, substitute);
}
}
14 changes: 11 additions & 3 deletions src/main/java/org/jsoup/nodes/Element.java
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import org.jsoup.helper.ChangeNotifyingArrayList;
import org.jsoup.helper.Validate;
import org.jsoup.internal.Normalizer;
import org.jsoup.internal.StringUtil;
import org.jsoup.parser.ParseSettings;
import org.jsoup.parser.Parser;
Expand Down Expand Up @@ -36,6 +37,8 @@
import java.util.stream.Stream;

import static org.jsoup.internal.Normalizer.normalize;
import static org.jsoup.nodes.Document.OutputSettings.Syntax.html;
import static org.jsoup.nodes.Document.OutputSettings.Syntax.xml;
import static org.jsoup.nodes.TextNode.lastCharIsWhitespace;
import static org.jsoup.parser.Parser.NamespaceHtml;
import static org.jsoup.parser.TokenQueue.escapeCssIdentifier;
Expand Down Expand Up @@ -1792,12 +1795,12 @@ void outerHtmlHead(final Appendable accum, int depth, final Document.OutputSetti
indent(accum, depth, out);
}
}
accum.append('<').append(tagName());
accum.append('<').append(safeTagName(out.syntax()));
if (attributes != null) attributes.html(accum, out);

// selfclosing includes unknown tags, isEmpty defines tags that are always empty
if (childNodes.isEmpty() && tag.isSelfClosing()) {
if (out.syntax() == Document.OutputSettings.Syntax.html && tag.isEmpty())
if (out.syntax() == html && tag.isEmpty())
accum.append('>');
else
accum.append(" />"); // <img> in html, <img /> in xml
Expand All @@ -1814,10 +1817,15 @@ void outerHtmlTail(Appendable accum, int depth, Document.OutputSettings out) thr
(out.outline() && (childNodes.size()>1 || (childNodes.size()==1 && (childNodes.get(0) instanceof Element))))
)))
indent(accum, depth, out);
accum.append("</").append(tagName()).append('>');
accum.append("</").append(safeTagName(out.syntax())).append('>');
}
}

/* Tag name to emit for the given output syntax: for XML, any < in the name is rewritten to _; otherwise the name is used as-is. */
private String safeTagName(Document.OutputSettings.Syntax syntax) {
    final String name = tagName();
    if (syntax != xml)
        return name;
    return Normalizer.xmlSafeTagName(name);
}

/**
* Retrieves the element's inner HTML. E.g. on a {@code <div>} with one empty {@code <p>}, would return
* {@code <p></p>}. (Whereas {@link #outerHtml()} would return {@code <div><p></p></div>}.)
Expand Down
46 changes: 42 additions & 4 deletions src/test/java/org/jsoup/helper/W3CDomTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@
import java.util.Locale;
import java.util.Map;

import static org.jsoup.TextUtil.normalizeSpaces;
import static org.jsoup.nodes.Document.OutputSettings.Syntax.xml;
import static org.junit.jupiter.api.Assertions.*;

public class W3CDomTest {
Expand Down Expand Up @@ -137,7 +139,6 @@ public void namespacePreservation() throws IOException {
assertEquals("http://www.w3.org/1999/xhtml", img.getNamespaceURI());
assertEquals("img", img.getLocalName());
assertEquals("img", img.getNodeName());

}

@Test
Expand Down Expand Up @@ -171,7 +172,7 @@ public void xmlInputDocMaintainsHtmlAttributeNames() {
String html = "<!DOCTYPE html><html><head></head><body><p hành=\"1\" hình=\"2\">unicode attr names coerced</p></body></html>";
org.jsoup.nodes.Document jsoupDoc;
jsoupDoc = Jsoup.parse(html);
jsoupDoc.outputSettings().syntax(org.jsoup.nodes.Document.OutputSettings.Syntax.xml);
jsoupDoc.outputSettings().syntax(xml);

Document w3Doc = W3CDom.convert(jsoupDoc);
String out = W3CDom.asString(w3Doc, W3CDom.OutputHtml());
Expand All @@ -188,6 +189,43 @@ public void handlesInvalidTagAsText() {
assertEquals("<?xml version=\"1.0\" encoding=\"UTF-8\"?><html xmlns=\"http://www.w3.org/1999/xhtml\"><head/><body>&lt;インセンティブで高収入!&gt;Text <p>More</p></body></html>", xml);
}

// Verifies that element and attribute names containing "<" keep their name in HTML output, and are rewritten
// with "_" in XML output and in the W3C DOM conversion.
@Test void handlesHtmlElsWithLt() {
// In HTML, elements can be named "foo<bar" (<foo<bar>), and attributes may similarly have "<" in their name.
// This test checks, in order: the HTML serialization round trips; the XML-syntax serialization rewrites "<" to "_"
// and round trips; that XML output can be parsed via the parseXml helper; and that W3CDom.convert yields the
// rewritten names.
// https://github.com/jhy/jsoup/issues/2259
String input = "<foo<bar attr<name=\"123\"><b>Text</b></foo<bar>";
String xmlExpect = "<foo_bar attr_name=\"123\"><b>Text</b></foo_bar>"; // rewrites < to _ in el and attr

// html round trips: serialize, compare to the input, re-parse, and compare the two bodies
org.jsoup.nodes.Document htmlDoc = Jsoup.parse(input);
String htmlSerial = htmlDoc.body().html();
assertEquals(input, normalizeSpaces(htmlSerial)); // same as input
Element htmlRound = Jsoup.parse(htmlSerial).body();
assertTrue(htmlDoc.body().hasSameValue(htmlRound));

// xml round trips: switching htmlDoc to xml syntax affects every later serialization of htmlDoc in this test
htmlDoc.outputSettings().syntax(xml);
String asXml = htmlDoc.body().html();
assertEquals(xmlExpect, normalizeSpaces(asXml)); // <foo<bar> -> <foo_bar>
// re-parsed with the same Jsoup.parse(String) call that was used for the HTML input above
org.jsoup.nodes.Document xmlDoc = Jsoup.parse(asXml);
String xmlSerial = xmlDoc.body().html();
assertEquals(xmlExpect, normalizeSpaces(xmlSerial)); // same as xmlExpect
Element xmlRound = Jsoup.parse(xmlSerial).body();
assertTrue(xmlDoc.body().hasSameValue(xmlRound));

// Can W3C parse that XML (parseXml is a helper defined elsewhere in this test class; meaning of its boolean
// argument is not visible here - TODO confirm)
Document w3cXml = parseXml(asXml, true);
NodeList w3cXmlNodes = w3cXml.getElementsByTagName("foo_bar");
assertEquals(1, w3cXmlNodes.getLength());
assertEquals("123", w3cXmlNodes.item(0).getAttributes().getNamedItem("attr_name").getTextContent());

// Can convert to W3C: the converted DOM exposes the element and attribute under their rewritten names
Document w3cDoc = W3CDom.convert(htmlDoc);
NodeList w3cNodes = w3cDoc.getElementsByTagName("foo_bar");
assertEquals(1, w3cNodes.getLength());
assertEquals("123", w3cNodes.item(0).getAttributes().getNamedItem("attr_name").getTextContent());
}

@Test
public void canConvertToCustomDocument() throws ParserConfigurationException {
org.jsoup.nodes.Document document = Jsoup.parse("<html><div></div></html>");
Expand Down Expand Up @@ -317,7 +355,7 @@ private String output(String in, boolean modeHtml) {
Document w3c = W3CDom.convert(jdoc);

Map<String, String> properties = modeHtml ? W3CDom.OutputHtml() : W3CDom.OutputXml();
return TextUtil.normalizeSpaces(W3CDom.asString(w3c, properties));
return normalizeSpaces(W3CDom.asString(w3c, properties));
}

private void assertEqualsIgnoreCase(String want, String have) {
Expand Down Expand Up @@ -364,7 +402,7 @@ public void canOutputHtmlWithoutNamespace() {
@Test public void canXmlParseCdataNodes() throws XPathExpressionException {
String html = "<p><script>1 && 2</script><style>3 && 4</style> 5 &amp;&amp; 6</p>";
org.jsoup.nodes.Document jdoc = Jsoup.parse(html);
jdoc.outputSettings().syntax(org.jsoup.nodes.Document.OutputSettings.Syntax.xml);
jdoc.outputSettings().syntax(xml);
String xml = jdoc.body().html();
assertTrue(xml.contains("<script>//<![CDATA[\n1 && 2\n//]]></script>")); // as asserted in ElementTest
Document doc = parseXml(xml, false);
Expand Down
Loading