const { createToken: createTokenOrg, Lexer } = require("chevrotain");

// A little mini DSL for easier lexer definition.
const fragments = {};
const f = fragments;

function FRAGMENT(name, def) {
  fragments[name] = typeof def === "string" ? def : def.source;
}

function makePattern(strings, ...args) {
  let combined = "";
  for (let i = 0; i < strings.length; i++) {
    combined += strings[i];
    if (i < args.length) {
      let pattern = args[i];
      // By wrapping the interpolated pattern in a RegExp non-capturing group
      // we enable the safe usage of quantifiers and assertions.
      combined += `(?:${pattern})`;
    }
  }
  return new RegExp(combined);
}

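// For example (illustrative): makePattern`a${"b|c"}d` produces /a(?:b|c)d/.
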
const tokensArray = [];
const tokensDictionary = {};

// Wraps chevrotain's createToken so every created TokenType is also recorded
// in `tokensArray` and in the exported `tokensDictionary` (keyed by token name).
function createToken(options) {
  const newTokenType = createTokenOrg(options);
  tokensArray.push(newTokenType);
  tokensDictionary[options.name] = newTokenType;
  return newTokenType;
}

// A subset of the XML spec's NameStartChar ranges.
FRAGMENT(
  "NameStartChar",
  "(:|[a-zA-Z]|_|[\\u2070-\\u218F]|[\\u2C00-\\u2FEF]|[\\u3001-\\uD7FF]|[\\uF900-\\uFDCF]|[\\uFDF0-\\uFFFD])"
);

FRAGMENT(
  "NameChar",
  makePattern`${f.NameStartChar}|-|\\.|\\d|\\u00B7|[\\u0300-\\u036F]|[\\u203F-\\u2040]`
);
FRAGMENT("Name", makePattern`${f.NameStartChar}(${f.NameChar})*`);

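// Illustration of the mini DSL: the "EntityRef" token below is defined with
//   makePattern`&${f.Name};`
// which interpolates the "Name" fragment inside a non-capturing group,
// i.e. roughly /&(?:<Name>);/ where <Name> is the expanded fragment above.
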
const Comment = createToken({
  name: "Comment",
  pattern: /<!--(.|\r?\n)*?-->/,
  // A Comment may span multiple lines.
  line_breaks: true,
});

const CData = createToken({
  name: "CData",
  pattern: /<!\[CDATA\[(.|\r?\n)*?]]>/,
  line_breaks: true,
});

const DocType = createToken({
  name: "DocType",
  pattern: /<!DOCTYPE/,
  push_mode: "INSIDE",
});

const IgnoredDTD = createToken({
  name: "DTD",
  pattern: /<!.*?>/,
  group: Lexer.SKIPPED,
});

const EntityRef = createToken({
  name: "EntityRef",
  pattern: makePattern`&${f.Name};`,
});

const CharRef = createToken({
  name: "CharRef",
  pattern: /&#\d+;|&#x[a-fA-F0-9]+;/,
});

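// For illustration: EntityRef matches e.g. "&amp;" or "&lt;",
// while CharRef matches e.g. "&#65;" (decimal) or "&#x41;" (hex).
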
const SEA_WS = createToken({
  name: "SEA_WS",
  pattern: /( |\t|\n|\r\n)+/,
});

const XMLDeclOpen = createToken({
  name: "XMLDeclOpen",
  pattern: /<\?xml[ \t\r\n]/,
  push_mode: "INSIDE",
});

const SLASH_OPEN = createToken({
  name: "SLASH_OPEN",
  pattern: /<\//,
  push_mode: "INSIDE",
});

const INVALID_SLASH_OPEN = createToken({
  name: "INVALID_SLASH_OPEN",
  pattern: /<\//,
  categories: [SLASH_OPEN],
});

const PROCESSING_INSTRUCTION = createToken({
  name: "PROCESSING_INSTRUCTION",
  pattern: makePattern`<\\?${f.Name}.*\\?>`,
});

const OPEN = createToken({ name: "OPEN", pattern: /</, push_mode: "INSIDE" });
// Meant to avoid skipping the '<' token in a partial sequence of elements.
// Example of the problem this solves:
//   <
//   <from>john</from>
// - The second '<' would be skipped because '<' is not recognized in the "INSIDE" mode.
// - This means the AST would include only a single element instead of two.
const INVALID_OPEN_INSIDE = createToken({
  name: "INVALID_OPEN_INSIDE",
  pattern: /</,
  categories: [OPEN],
});

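// Note: because the INVALID_* tokens above list the "real" token type in `categories`,
// the parser accepts them wherever it consumes OPEN / SLASH_OPEN, so a stray '<' or '</'
// inside a tag still yields a token the parser can recover from instead of being
// dropped during lexing.
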
const TEXT = createToken({ name: "TEXT", pattern: /[^<&]+/ });

const CLOSE = createToken({ name: "CLOSE", pattern: />/, pop_mode: true });

const SPECIAL_CLOSE = createToken({
  name: "SPECIAL_CLOSE",
  pattern: /\?>/,
  pop_mode: true,
});

const SLASH_CLOSE = createToken({
  name: "SLASH_CLOSE",
  pattern: /\/>/,
  pop_mode: true,
});

const SLASH = createToken({ name: "SLASH", pattern: /\// });

const STRING = createToken({
  name: "STRING",
  pattern: /"[^<"]*"|'[^<']*'/,
});

const EQUALS = createToken({ name: "EQUALS", pattern: /=/ });

const Name = createToken({ name: "Name", pattern: makePattern`${f.Name}` });

const S = createToken({
  name: "S",
  pattern: /[ \t\r\n]/,
  group: Lexer.SKIPPED,
});

const xmlLexerDefinition = {
  defaultMode: "OUTSIDE",

  modes: {
    OUTSIDE: [
      Comment,
      CData,
      DocType,
      IgnoredDTD,
      EntityRef,
      CharRef,
      SEA_WS,
      XMLDeclOpen,
      SLASH_OPEN,
      PROCESSING_INSTRUCTION,
      OPEN,
      TEXT,
    ],
    INSIDE: [
      // Tokens from `OUTSIDE` to improve error recovery behavior
      Comment,
      INVALID_SLASH_OPEN,
      INVALID_OPEN_INSIDE,
      // "Real" `INSIDE` tokens
      CLOSE,
      SPECIAL_CLOSE,
      SLASH_CLOSE,
      SLASH,
      EQUALS,
      STRING,
      Name,
      S,
    ],
  },
};

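// Illustrative mode flow for an input like '<a x="1"/>' (whitespace becomes S and is skipped):
//   OUTSIDE: OPEN '<'          -> push_mode "INSIDE"
//   INSIDE:  Name 'a', Name 'x', EQUALS '=', STRING '"1"'
//   INSIDE:  SLASH_CLOSE '/>'  -> pop_mode, back to "OUTSIDE"
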
const xmlLexer = new Lexer(xmlLexerDefinition, {
  // Reducing the amount of position tracking can provide a small performance boost (<10%),
  // but it is likely best to keep the full info for better error position reporting
  // and to expose "fuller" ITokens from the Lexer.
  positionTracking: "full",
  ensureOptimizations: false,

  // TODO: inspect definitions for XML line terminators
  lineTerminatorCharacters: ["\n"],
  lineTerminatorsPattern: /\n|\r\n/g,
});

module.exports = {
  xmlLexer,
  tokensDictionary,
};

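// Example usage (a minimal sketch; the require path is an assumption, adjust to this file's location):
//   const { xmlLexer, tokensDictionary } = require("./lexer");
//   const { tokens, errors } = xmlLexer.tokenize('<note to="john">hi</note>');
//   // tokens[0].tokenType === tokensDictionary.OPEN, tokens[1].image === "note", ...
//   // `errors` holds any lexing errors (unexpected character sequences).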