1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166 |
x11
x11
x11
x11
x11
x11
x11
x11
x11
x11
x11
x11
x11
x11
x11
x11
x11
x11
x11
x11
x11
x11
x11
x11
x11
x11
x11
x11
x64
x64
x64
x64
x64
x64
x64
x64
x11
x354
x90
x90
x90
x90
x90
x90
x90
x90
x112
x28
x28
x13
x13
x13
x13
x15
x15
x112
x36
x36
x14
x14
x14
x14
x22
x22
x104
x37
x37
x34
x34
x3
x3
x3
x3
x11
x11
x11
x90
x354
x11
x22
x9
x22
x11
x15
x9
x15 |
|
// Copyright 2018-2026 the Deno authors. MIT license.
// This module is browser compatible.
/**
* Internal module for XML entity encoding and decoding.
*
* @module
*/
// Single-pass regex that matches all entity references and bare ampersands.
// Group 1: named entity ([a-zA-Z][a-zA-Z0-9]*, e.g. "amp", "foo1")
// Group 2: decimal char ref (e.g. "#13")
// Group 3: hex char ref (e.g. "#xd")
// No groups: bare/invalid ampersand
// Groups 2 and 3 capture the "#" / "#x" prefix together with the digits, and
// only a lowercase "x" introduces a hex reference. Because the whole entity
// body is optional, every "&" in the input produces a match; the global flag
// lets one `replace` call visit each of them.
const ENTITY_OR_AMPERSAND_REGEXP =
/&(?:([a-zA-Z][a-zA-Z0-9]*);|(#[0-9]+);|(#x[0-9a-fA-F]+);)?/g;
// Characters replaced when encoding text: <, >, &, ' and ".
const SPECIAL_CHARS_REGEXP = /[<>&'"]/g;
// Characters replaced when encoding attribute values: the five special
// characters plus tab, line feed and carriage return.
const ATTR_ENCODE_REGEXP = /[<>&'"\t\n\r]/g;
/**
 * Decode table for the five entities predefined by XML 1.0 §4.6.
 *
 * Keys are entity names without the surrounding `&` and `;`; values are the
 * single character each entity stands for.
 */
const NAMED_ENTITIES: Record<string, string> = {
  "lt": "<",
  "gt": ">",
  "amp": "&",
  "apos": "'",
  "quot": '"',
};
/**
 * XML 1.0 §4.6 predefined entities (encode).
 *
 * Maps each markup-significant character to the entity reference that
 * replaces it in encoded output. The values must be the escaped references
 * (`&lt;`, `&amp;`, ...), never the raw characters, otherwise encoding is a
 * no-op and the output is not well-formed XML.
 */
const ENTITY_MAP: Record<string, string> = {
  "<": "&lt;",
  ">": "&gt;",
  "&": "&amp;",
  "'": "&apos;",
  '"': "&quot;",
};
/**
 * Entity map extended with whitespace for attribute values (§3.3.3).
 *
 * Tab, line feed and carriage return are written as numeric character
 * references: a literal whitespace character inside an attribute value is
 * normalized to a space by the parser, whereas a character reference is kept
 * as the referenced character.
 */
const ATTR_ENTITY_MAP: Record<string, string> = {
  ...ENTITY_MAP,
  "\t": "&#9;",
  "\n": "&#10;",
  "\r": "&#13;",
};
/**
 * Tells whether `codePoint` belongs to the XML 1.0 `Char` production (§2.2):
 *
 *   Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
 *
 * Rejected as a consequence:
 * - NULL (#x0)
 * - Control characters #x1-#x8, #xB-#xC, #xE-#x1F
 * - The surrogate block #xD800-#xDFFF
 * - Non-characters #xFFFE-#xFFFF
 * - Anything above #x10FFFF
 *
 * @param codePoint The numeric code point to test.
 * @returns `true` when the code point is an allowed XML character.
 *
 * @see {@link https://www.w3.org/TR/xml/#charsets | XML 1.0 §2.2 Characters}
 */
function isValidXmlChar(codePoint: number): boolean {
  // Tab, line feed and carriage return are the only characters allowed
  // below #x20.
  if (codePoint === 0x9 || codePoint === 0xA || codePoint === 0xD) {
    return true;
  }
  // From space up to the last code point before the surrogate block.
  if (codePoint >= 0x20 && codePoint <= 0xD7FF) {
    return true;
  }
  // After the surrogate block, stopping short of #xFFFE/#xFFFF.
  if (codePoint >= 0xE000 && codePoint <= 0xFFFD) {
    return true;
  }
  // Supplementary planes.
  return codePoint >= 0x10000 && codePoint <= 0x10FFFF;
}
/**
 * Decodes XML entities in a string.
 *
 * This parser only supports the five predefined XML entities (§4.6)
 * and numeric character references (§4.1). Custom entities defined in DTD are
 * NOT expanded - this is a deliberate design choice for:
 * - Security: Prevents entity expansion attacks (billion laughs, etc.)
 * - Simplicity: No need to track DTD entity definitions
 * - Consistency: Matches behavior of popular parsers like saxes
 *
 * External entities (SYSTEM/PUBLIC) are also not supported.
 *
 * @param text The text that may contain entity or character references.
 * @returns The text with predefined entities and character references decoded.
 * @throws {Error} If the text contains a bare `&`, an entity name other than
 * the five predefined ones, or a character reference to a code point that is
 * not a valid XML character.
 */
export function decodeEntities(text: string): string {
  // Fast path: no ampersand means no entities to decode
  if (!text.includes("&")) return text;

  // Single-pass: decode predefined entities and char refs, error on invalid
  return text.replace(
    ENTITY_OR_AMPERSAND_REGEXP,
    (
      match: string,
      namedEntity: string | undefined,
      decimalRef: string | undefined,
      hexRef: string | undefined,
      offset: number,
    ) => {
      // Numeric character reference. The capture still carries its prefix
      // ("#x" for hex, "#" for decimal), which is sliced off before parsing.
      let codePoint: number | undefined;
      if (hexRef !== undefined) {
        codePoint = parseInt(hexRef.slice(2), 16);
      } else if (decimalRef !== undefined) {
        codePoint = parseInt(decimalRef.slice(1), 10);
      }
      if (codePoint !== undefined) {
        if (!isValidXmlChar(codePoint)) {
          throw new Error(
            `Invalid character reference '${match}' at position ${offset}: ` +
              `code point ${codePoint} is not a valid XML character`,
          );
        }
        return String.fromCodePoint(codePoint);
      }

      // Named entity (&name;)
      if (namedEntity !== undefined) {
        // Own-property check: a plain index lookup would also resolve members
        // inherited from Object.prototype, so "&constructor;" or "&toString;"
        // would be replaced by a stringified function instead of rejected.
        const predefined =
          Object.prototype.hasOwnProperty.call(NAMED_ENTITIES, namedEntity)
            ? NAMED_ENTITIES[namedEntity]
            : undefined;
        if (predefined !== undefined) {
          return predefined;
        }
        throw new Error(
          `Unknown entity '${match}' at position ${offset}: ` +
            `only predefined entities (lt, gt, amp, apos, quot) are recognized`,
        );
      }

      // Bare ampersand (no valid entity pattern matched)
      throw new Error(
        `Invalid bare '&' at position ${offset}: ` +
          `use &amp; or a valid entity reference (&name;, &#num;, &#xHex;)`,
      );
    },
  );
}
/**
 * Replaces the five XML special characters (`<`, `>`, `&`, `'`, `"`) found in
 * `text` with the strings listed for them in `ENTITY_MAP`.
 *
 * @param text The raw text to encode.
 * @returns `text` itself when it holds none of those characters, otherwise a
 * copy with every occurrence replaced.
 */
export function encodeEntities(text: string): string {
  const needsEncoding = /[<>&'"]/.test(text);
  if (needsEncoding) {
    return text.replace(SPECIAL_CHARS_REGEXP, (ch) => ENTITY_MAP[ch]!);
  }
  // Nothing to escape: hand the input back untouched.
  return text;
}
/**
 * Prepares a string for use as an XML attribute value.
 *
 * Besides the five special characters (`<`, `>`, `&`, `'`, `"`), tab, line
 * feed and carriage return are replaced as well, since XML 1.0 §3.3.3
 * normalizes that whitespace in attribute values. Replacements are taken from
 * `ATTR_ENTITY_MAP`.
 *
 * @param value The raw attribute value.
 * @returns `value` itself when it holds none of those characters, otherwise a
 * copy with every occurrence replaced.
 */
export function encodeAttributeValue(value: string): string {
  const needsEncoding = /[<>&'"\t\n\r]/.test(value);
  if (needsEncoding) {
    return value.replace(ATTR_ENCODE_REGEXP, (ch) => ATTR_ENTITY_MAP[ch]!);
  }
  // Nothing to escape: hand the input back untouched.
  return value;
}
|