All files / xml / _name_chars.ts

100.00% Branches 71/71
100.00% Lines 55/55
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
x10
 
 
 
 
 
x10
 
 
x10
x1290
 
 
x1290
x1290
x1290
x1290
 
x1290
x1290
 
 
x1290
x1290
 
 
 
 
 
 
 
 
 
 
 
 
 
 
x10
 
x73
 
 
x73
 
 
x73
 
 
x73
 
 
x73
x79
x79
 
 
x73
 
 
x73
 
 
x73
 
x89
x73
 
 
 
 
 
 
 
x10
 
x29
 
 
x29
 
 
x29
 
x45
x29
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
x10
 
x169
x274
x274
x274
x274
 
 
x169
x179
x179
 
 
x213
x169
 
 
 
 
 
 
 
 
 
 
 
 
x10
 
x84
x135
x135
x135
x135
x135
x135
 
 
x84
x88
x88
 
 
x103
x84















































































































































































// Copyright 2018-2026 the Deno authors. MIT license.
// This module is browser compatible.

/**
 * XML 1.0 Fifth Edition name character validation.
 *
 * Provides optimized validation for NameStartChar and NameChar per the XML spec:
 * https://www.w3.org/TR/xml/#NT-NameStartChar
 *
 * Performance strategy:
 * 1. Inline ASCII checks (99%+ of real-world XML) - no memory access
 * 2. Lookup tables for Latin-1 Supplement (0x80-0xFF) - 128 bytes each
 * 3. Ordered range checks for Unicode (0x100+) - most common ranges first
 *
 * @module
 */

// =============================================================================
// LOOKUP TABLES FOR LATIN-1 SUPPLEMENT (0x80-0xFF)
// =============================================================================

/**
 * Membership table for NameStartChar within Latin-1 Supplement (0x80-0xFF).
 * Index `i` describes code point `i + 0x80`; an entry of 1 means "allowed".
 * Allowed: [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#xFF]
 * Rejected: 0x80-0xBF, 0xD7 (×), 0xF7 (÷)
 */
const LATIN1_NAME_START = new Uint8Array(128);

/**
 * Membership table for NameChar within Latin-1 Supplement (0x80-0xFF).
 * Index `i` describes code point `i + 0x80`; an entry of 1 means "allowed".
 * Contains everything in the NameStartChar table, plus 0xB7 (·).
 */
const LATIN1_NAME_CHAR = new Uint8Array(128);

// Fill both tables once at module load. Typed arrays start out zero-filled,
// so only the allowed code points need to be written.
for (let code = 0xC0; code <= 0xFF; code++) {
  // 0xD7 (×) and 0xF7 (÷) are the two holes inside [#xC0-#xFF].
  if (code === 0xD7 || code === 0xF7) continue;

  const slot = code - 0x80;
  LATIN1_NAME_START[slot] = 1;
  LATIN1_NAME_CHAR[slot] = 1;
}

// NameChar additionally admits the middle dot.
LATIN1_NAME_CHAR[0xB7 - 0x80] = 1;

// =============================================================================
// UNICODE RANGE VALIDATION (code >= 0x100)
// =============================================================================

/**
 * Range test for NameStartChar on code points at or above 0x100.
 *
 * The checks are listed roughly by how often each range shows up in
 * real-world XML (CJK first), so the common cases short-circuit early.
 * Any value below 0x100 matches none of the ranges and yields `false`.
 *
 * XML 1.0 Fifth Edition NameStartChar ranges covered here:
 *   [#x100-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] |
 *   [#x2070-#x218F] | [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] |
 *   [#xFDF0-#xFFFD] | [#x10000-#xEFFFF]
 */
function isUnicodeNameStartChar(code: number): boolean {
  return (
    // [#x3001-#xD7FF] - CJK, Hangul, etc. (most common non-ASCII in real XML)
    (0x3001 <= code && code <= 0xD7FF) ||
    // [#x100-#x2FF] - Latin Extended-A/B, IPA Extensions, Spacing Modifiers
    (0x100 <= code && code <= 0x2FF) ||
    // [#x370-#x37D] | [#x37F-#x1FFF] - Greek, Cyrillic, Armenian, Hebrew,
    // Arabic, etc.; 0x37E is the single excluded code point in between
    (0x370 <= code && code <= 0x1FFF && code !== 0x37E) ||
    // [#x2C00-#x2FEF] - Glagolitic, Latin Extended-C/D, Coptic, Georgian Supplement
    (0x2C00 <= code && code <= 0x2FEF) ||
    // [#xF900-#xFDCF] - CJK Compatibility, Arabic Presentation Forms (lower part)
    (0xF900 <= code && code <= 0xFDCF) ||
    // [#xFDF0-#xFFFD] - Arabic Presentation Forms (upper part) and beyond
    (0xFDF0 <= code && code <= 0xFFFD) ||
    // [#x2070-#x218F] - Superscripts, Currency, Letterlike, Number Forms
    (0x2070 <= code && code <= 0x218F) ||
    // [#x200C-#x200D] - Zero-width non-joiner/joiner
    code === 0x200C ||
    code === 0x200D ||
    // [#x10000-#xEFFFF] - Astral planes
    (0x10000 <= code && code <= 0xEFFFF)
  );
}

/**
 * Range test for NameChar on code points at or above 0x100.
 *
 * A NameChar is any NameStartChar, plus these extra code points:
 *   [#x0300-#x036F] | [#x203F-#x2040]
 * (#xB7 is also a NameChar but lies below 0x100, so it is not tested here.)
 */
function isUnicodeNameChar(code: number): boolean {
  return (
    // Everything that may start a name may also continue one
    isUnicodeNameStartChar(code) ||
    // [#x0300-#x036F] - Combining Diacritical Marks
    (0x0300 <= code && code <= 0x036F) ||
    // [#x203F-#x2040] - Undertie, Character Tie
    code === 0x203F ||
    code === 0x2040
  );
}

// =============================================================================
// MAIN VALIDATION FUNCTIONS
// =============================================================================

/**
 * Tell whether a code point may begin an XML name.
 *
 * NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] |
 *                   [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF] |
 *                   [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] |
 *                   [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] |
 *                   [#x10000-#xEFFFF]
 *
 * ASCII input (letters, underscore, colon) is answered with plain
 * comparisons; 0x80-0xFF goes through a lookup table; anything from 0x100
 * upwards goes through the Unicode range checks.
 *
 * @param code The Unicode code point to check
 * @returns true if valid NameStartChar, false otherwise
 */
export function isNameStartChar(code: number): boolean {
  // Non-ASCII: table lookup for Latin-1 Supplement, range checks above it
  if (code >= 0x80) {
    return code >= 0x100
      ? isUnicodeNameStartChar(code)
      : LATIN1_NAME_START[code - 0x80] === 1;
  }

  // ASCII fast path (the overwhelming majority of real-world XML)
  if (code >= 0x61 && code <= 0x7A) return true; // a-z (most common)
  if (code >= 0x41 && code <= 0x5A) return true; // A-Z
  return code === 0x5F || code === 0x3A; // _ :
}

/**
 * Tell whether a code point may appear inside an XML name (after the first
 * character).
 *
 * NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 |
 *              [#x0300-#x036F] | [#x203F-#x2040]
 *
 * ASCII input (alphanumerics, underscore, colon, hyphen, dot) is answered
 * with plain comparisons; 0x80-0xFF goes through a lookup table; anything
 * from 0x100 upwards goes through the Unicode range checks.
 *
 * @param code The Unicode code point to check
 * @returns true if valid NameChar, false otherwise
 */
export function isNameChar(code: number): boolean {
  // Non-ASCII: table lookup for Latin-1 Supplement, range checks above it
  if (code >= 0x80) {
    return code >= 0x100
      ? isUnicodeNameChar(code)
      : LATIN1_NAME_CHAR[code - 0x80] === 1;
  }

  // ASCII fast path (the overwhelming majority of real-world XML)
  if (code >= 0x61 && code <= 0x7A) return true; // a-z (most common)
  if (code >= 0x41 && code <= 0x5A) return true; // A-Z
  if (code >= 0x30 && code <= 0x39) return true; // 0-9
  if (code === 0x5F || code === 0x3A) return true; // _ :
  return code === 0x2D || code === 0x2E; // - .
}