1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173 |
x11
x11
x11
x11
x11
x11
x11
x11
x11
x11
x11
x11
x11
x11
x11
x11
x11
x11
x11
x11
x11
x11
x11
x11
x11
x11
x11
x11
x75
x75
x75
x75
x75
x75
x75
x75
x11
x280
x370
x370
x370
x370
x370
x370
x370
x370
x482
x510
x510
x538
x538
x538
x538
x525
x525
x482
x518
x518
x532
x532
x532
x532
x540
x540
x586
x623
x623
x657
x657
x626
x626
x626
x626
x493
x493
x493
x370
x280
x11
x33
x42
x33
x11
x26
x35
x26 |
|
// Copyright 2018-2026 the Deno authors. MIT license.
// This module is browser compatible.
/**
* Internal module for XML entity encoding and decoding.
*
* @module
*/
// Regex patterns hoisted to module scope so they are compiled once.
//
// ENTITY_OR_AMPERSAND_RE is a single-pass pattern anchored on "&". At each
// ampersand it tries, in order:
//   Group 1: a letters-only entity name followed by ";" (captures e.g. "amp")
//   Group 2: a decimal char ref followed by ";" (captures e.g. "#13",
//            including the leading "#")
//   Group 3: a hex char ref followed by ";" (captures e.g. "#xd", including
//            the leading "#x")
//   No group: a lone "&", matched only when the negative lookahead confirms
//            that what follows is NOT name-like ([a-zA-Z][a-zA-Z0-9]*;) and
//            NOT a decimal/hex char ref. This is the bare/invalid ampersand.
//
// Group 1 deliberately uses [a-zA-Z]+ rather than [a-zA-Z0-9]*: a reference
// whose name contains digits (e.g. "&a1;") fails group 1 and is also rejected
// by the lookahead, so the pattern does not match it at all and it passes
// through unchanged (non-validating).
const ENTITY_OR_AMPERSAND_RE =
/&(?:([a-zA-Z]+);|(#[0-9]+);|(#x[0-9a-fA-F]+);|(?![a-zA-Z][a-zA-Z0-9]*;|#[0-9]+;|#x[0-9a-fA-F]+;))/g;
// Characters that are replaced when encoding text content.
const SPECIAL_CHARS_RE = /[<>&'"]/g;
// Characters that are replaced when encoding attribute values: the five above
// plus tab, line feed and carriage return.
const ATTR_ENCODE_RE = /[<>&'"\t\n\r]/g;
/**
 * XML 1.0 §4.6 predefined entities (decode): entity name → replacement text.
 *
 * The table is built on a null prototype so that lookups by arbitrary name
 * only ever see these five entries. With a plain object literal, names such
 * as "constructor" or "toString" would resolve to members inherited from
 * `Object.prototype` instead of `undefined`.
 */
const NAMED_ENTITIES: Record<string, string> = Object.assign(
  Object.create(null),
  {
    lt: "<",
    gt: ">",
    amp: "&",
    apos: "'",
    quot: '"',
  },
);
/**
 * XML 1.0 §4.6 predefined entities (encode): special character → entity
 * reference that replaces it in serialized output.
 */
const ENTITY_MAP: Record<string, string> = {
  "<": "&lt;",
  ">": "&gt;",
  "&": "&amp;",
  "'": "&apos;",
  '"': "&quot;",
};
/**
 * Entity map extended with whitespace for attribute values (§3.3.3).
 *
 * Tab, line feed and carriage return are written as numeric character
 * references so that they are emitted as explicit references rather than as
 * literal whitespace, which attribute-value normalization would alter.
 */
const ATTR_ENTITY_MAP: Record<string, string> = {
  ...ENTITY_MAP,
  "\t": "&#9;",
  "\n": "&#10;",
  "\r": "&#13;",
};
/**
 * Tells whether a code point is a legal XML 1.0 character (§2.2).
 *
 * The grammar production is:
 *   Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
 *
 * Everything outside those ranges is rejected, namely:
 * - NULL (#x0)
 * - Control characters #x1-#x8, #xB-#xC, #xE-#x1F
 * - The surrogate block #xD800-#xDFFF
 * - The non-characters #xFFFE-#xFFFF
 * - Anything above #x10FFFF
 *
 * @param codePoint The numeric code point to test.
 * @returns `true` if the code point is allowed by the `Char` production.
 *
 * @see {@link https://www.w3.org/TR/xml/#charsets | XML 1.0 §2.2 Characters}
 */
function isValidXmlChar(codePoint: number): boolean {
  // The three whitespace controls that XML permits below #x20.
  if (codePoint === 0x9 || codePoint === 0xA || codePoint === 0xD) {
    return true;
  }
  // Walk the range boundaries in ascending order; each guard closes one gap.
  if (codePoint < 0x20) return false; // remaining C0 controls and NULL
  if (codePoint <= 0xD7FF) return true;
  if (codePoint < 0xE000) return false; // surrogate block
  if (codePoint <= 0xFFFD) return true;
  if (codePoint < 0x10000) return false; // #xFFFE-#xFFFF
  return codePoint <= 0x10FFFF;
}
/**
 * Decodes XML entities in a string.
 *
 * This parser only supports the five predefined XML entities (§4.6)
 * and numeric character references (§4.1). Custom entities defined in DTD are
 * NOT expanded - this is a deliberate design choice for:
 * - Security: Prevents entity expansion attacks (billion laughs, etc.)
 * - Simplicity: No need to track DTD entity definitions
 * - Consistency: Matches behavior of popular parsers like saxes
 *
 * External entities (SYSTEM/PUBLIC) are also not supported.
 *
 * References whose name contains digits (e.g. `&a1;`) are not matched by the
 * entity pattern and are left in the output unchanged.
 *
 * @param text The text containing XML entities to decode.
 * @returns The text with predefined entities and character references decoded.
 * @throws {Error} If a character reference denotes a code point that is not a
 * valid XML character, if a letters-only entity name is not one of the five
 * predefined entities, or if the text contains a bare `&`.
 */
export function decodeEntities(text: string): string {
  // Fast path: no ampersand means no entities to decode
  if (!text.includes("&")) return text;
  // Single-pass: decode predefined entities and char refs, error on invalid
  return text.replace(
    ENTITY_OR_AMPERSAND_RE,
    (
      match: string,
      namedEntity: string | undefined,
      decimalRef: string | undefined,
      hexRef: string | undefined,
      offset: number,
    ) => {
      // Numeric character reference: the captures still carry their "#x" /
      // "#" prefix, which is stripped before parsing.
      let codePoint: number | undefined;
      if (hexRef !== undefined) {
        codePoint = parseInt(hexRef.slice(2), 16);
      } else if (decimalRef !== undefined) {
        codePoint = parseInt(decimalRef.slice(1), 10);
      }
      if (codePoint !== undefined) {
        // Validate before String.fromCodePoint so out-of-range values surface
        // as a descriptive Error rather than a RangeError.
        if (!isValidXmlChar(codePoint)) {
          throw new Error(
            `Invalid character reference '${match}' at position ${offset}: ` +
              `code point ${codePoint} is not a valid XML character`,
          );
        }
        return String.fromCodePoint(codePoint);
      }
      // Named entity (&name;) - only letters matched
      if (namedEntity !== undefined) {
        // Own-property check: a plain lookup would also find members inherited
        // from Object.prototype, so "&constructor;" or "&toString;" would be
        // "decoded" to a stringified function instead of being rejected.
        const predefined =
          Object.prototype.hasOwnProperty.call(NAMED_ENTITIES, namedEntity)
            ? NAMED_ENTITIES[namedEntity]
            : undefined;
        if (predefined !== undefined) {
          return predefined;
        }
        // Unknown letter-only entity
        throw new Error(
          `Unknown entity '${match}' at position ${offset}: ` +
            `only predefined entities (lt, gt, amp, apos, quot) are supported`,
        );
      }
      // Bare ampersand (no valid entity pattern matched)
      throw new Error(
        `Invalid bare '&' at position ${offset}: ` +
          `entity references must be &name; or &#num; or &#xHex;`,
      );
    },
  );
}
/**
 * Replaces XML special characters in text with their entity references.
 *
 * Each of `<`, `>`, `&`, `'` and `"` is substituted with its entry in
 * `ENTITY_MAP`; all other characters are left as they are.
 *
 * @param text The text to encode.
 * @returns The encoded text, or `text` itself when it contains none of the
 * special characters.
 */
export function encodeEntities(text: string): string {
  // Cheap pre-check so strings with nothing to escape are returned untouched.
  const needsEncoding = /[<>&'"]/.test(text);
  if (!needsEncoding) {
    return text;
  }
  return text.replace(SPECIAL_CHARS_RE, (ch) => ENTITY_MAP[ch]!);
}
/**
 * Encodes a string for use as an XML attribute value.
 *
 * In addition to the five special characters `<`, `>`, `&`, `'` and `"`, the
 * whitespace characters tab, line feed and carriage return are replaced,
 * since they would otherwise be normalized per XML 1.0 §3.3.3. Replacements
 * are taken from `ATTR_ENTITY_MAP`.
 *
 * @param value The attribute value to encode.
 * @returns The encoded attribute value, or `value` itself when it contains
 * none of the characters above.
 */
export function encodeAttributeValue(value: string): string {
  // Cheap pre-check so values with nothing to escape are returned untouched.
  const hasEncodableChar = /[<>&'"\t\n\r]/.test(value);
  return hasEncodableChar
    ? value.replace(ATTR_ENCODE_RE, (ch) => ATTR_ENTITY_MAP[ch]!)
    : value;
}
|