securityos/node_modules/lunr/lib/token_set.js

416 lines
11 KiB
JavaScript

/*!
* lunr.TokenSet
* Copyright (C) @YEAR Oliver Nightingale
*/
/**
* A token set is used to store the unique list of all tokens
* within an index. Token sets are also used to represent an
* incoming query to the index, this query token set and index
* token set are then intersected to find which tokens to look
* up in the inverted index.
*
* A token set can hold multiple tokens, as in the case of the
* index token set, or it can hold a single token as in the
* case of a simple query token set.
*
* Additionally token sets are used to perform wildcard matching.
* Leading, contained and trailing wildcards are supported, and
* from this edit distance matching can also be provided.
*
* Token sets are implemented as a minimal finite state automata,
* where both common prefixes and suffixes are shared between tokens.
* This helps to reduce the space used for storing the token set.
*
* @constructor
*/
lunr.TokenSet = function () {
this.final = false
this.edges = {}
this.id = lunr.TokenSet._nextId
lunr.TokenSet._nextId += 1
}
/**
* Keeps track of the next, auto increment, identifier to assign
* to a new tokenSet.
*
* TokenSets require a unique identifier to be correctly minimised.
*
* @private
*/
lunr.TokenSet._nextId = 1
/**
* Creates a TokenSet instance from the given sorted array of words.
*
* @param {String[]} arr - A sorted array of strings to create the set from.
* @returns {lunr.TokenSet}
* @throws Will throw an error if the input array is not sorted.
*/
lunr.TokenSet.fromArray = function (arr) {
var builder = new lunr.TokenSet.Builder
for (var i = 0, len = arr.length; i < len; i++) {
builder.insert(arr[i])
}
builder.finish()
return builder.root
}
/**
* Creates a token set from a query clause.
*
* @private
* @param {Object} clause - A single clause from lunr.Query.
* @param {string} clause.term - The query clause term.
* @param {number} [clause.editDistance] - The optional edit distance for the term.
* @returns {lunr.TokenSet}
*/
lunr.TokenSet.fromClause = function (clause) {
if ('editDistance' in clause) {
return lunr.TokenSet.fromFuzzyString(clause.term, clause.editDistance)
} else {
return lunr.TokenSet.fromString(clause.term)
}
}
/**
* Creates a token set representing a single string with a specified
* edit distance.
*
* Insertions, deletions, substitutions and transpositions are each
* treated as an edit distance of 1.
*
* Increasing the allowed edit distance will have a dramatic impact
* on the performance of both creating and intersecting these TokenSets.
* It is advised to keep the edit distance less than 3.
*
* @param {string} str - The string to create the token set from.
* @param {number} editDistance - The allowed edit distance to match.
* @returns {lunr.Vector}
*/
lunr.TokenSet.fromFuzzyString = function (str, editDistance) {
var root = new lunr.TokenSet
var stack = [{
node: root,
editsRemaining: editDistance,
str: str
}]
while (stack.length) {
var frame = stack.pop()
// no edit
if (frame.str.length > 0) {
var char = frame.str.charAt(0),
noEditNode
if (char in frame.node.edges) {
noEditNode = frame.node.edges[char]
} else {
noEditNode = new lunr.TokenSet
frame.node.edges[char] = noEditNode
}
if (frame.str.length == 1) {
noEditNode.final = true
}
stack.push({
node: noEditNode,
editsRemaining: frame.editsRemaining,
str: frame.str.slice(1)
})
}
if (frame.editsRemaining == 0) {
continue
}
// insertion
if ("*" in frame.node.edges) {
var insertionNode = frame.node.edges["*"]
} else {
var insertionNode = new lunr.TokenSet
frame.node.edges["*"] = insertionNode
}
if (frame.str.length == 0) {
insertionNode.final = true
}
stack.push({
node: insertionNode,
editsRemaining: frame.editsRemaining - 1,
str: frame.str
})
// deletion
// can only do a deletion if we have enough edits remaining
// and if there are characters left to delete in the string
if (frame.str.length > 1) {
stack.push({
node: frame.node,
editsRemaining: frame.editsRemaining - 1,
str: frame.str.slice(1)
})
}
// deletion
// just removing the last character from the str
if (frame.str.length == 1) {
frame.node.final = true
}
// substitution
// can only do a substitution if we have enough edits remaining
// and if there are characters left to substitute
if (frame.str.length >= 1) {
if ("*" in frame.node.edges) {
var substitutionNode = frame.node.edges["*"]
} else {
var substitutionNode = new lunr.TokenSet
frame.node.edges["*"] = substitutionNode
}
if (frame.str.length == 1) {
substitutionNode.final = true
}
stack.push({
node: substitutionNode,
editsRemaining: frame.editsRemaining - 1,
str: frame.str.slice(1)
})
}
// transposition
// can only do a transposition if there are edits remaining
// and there are enough characters to transpose
if (frame.str.length > 1) {
var charA = frame.str.charAt(0),
charB = frame.str.charAt(1),
transposeNode
if (charB in frame.node.edges) {
transposeNode = frame.node.edges[charB]
} else {
transposeNode = new lunr.TokenSet
frame.node.edges[charB] = transposeNode
}
if (frame.str.length == 1) {
transposeNode.final = true
}
stack.push({
node: transposeNode,
editsRemaining: frame.editsRemaining - 1,
str: charA + frame.str.slice(2)
})
}
}
return root
}
/**
* Creates a TokenSet from a string.
*
* The string may contain one or more wildcard characters (*)
* that will allow wildcard matching when intersecting with
* another TokenSet.
*
* @param {string} str - The string to create a TokenSet from.
* @returns {lunr.TokenSet}
*/
lunr.TokenSet.fromString = function (str) {
var node = new lunr.TokenSet,
root = node
/*
* Iterates through all characters within the passed string
* appending a node for each character.
*
* When a wildcard character is found then a self
* referencing edge is introduced to continually match
* any number of any characters.
*/
for (var i = 0, len = str.length; i < len; i++) {
var char = str[i],
final = (i == len - 1)
if (char == "*") {
node.edges[char] = node
node.final = final
} else {
var next = new lunr.TokenSet
next.final = final
node.edges[char] = next
node = next
}
}
return root
}
/**
* Converts this TokenSet into an array of strings
* contained within the TokenSet.
*
* This is not intended to be used on a TokenSet that
* contains wildcards, in these cases the results are
* undefined and are likely to cause an infinite loop.
*
* @returns {string[]}
*/
lunr.TokenSet.prototype.toArray = function () {
var words = []
var stack = [{
prefix: "",
node: this
}]
while (stack.length) {
var frame = stack.pop(),
edges = Object.keys(frame.node.edges),
len = edges.length
if (frame.node.final) {
/* In Safari, at this point the prefix is sometimes corrupted, see:
* https://github.com/olivernn/lunr.js/issues/279 Calling any
* String.prototype method forces Safari to "cast" this string to what
* it's supposed to be, fixing the bug. */
frame.prefix.charAt(0)
words.push(frame.prefix)
}
for (var i = 0; i < len; i++) {
var edge = edges[i]
stack.push({
prefix: frame.prefix.concat(edge),
node: frame.node.edges[edge]
})
}
}
return words
}
/**
* Generates a string representation of a TokenSet.
*
* This is intended to allow TokenSets to be used as keys
* in objects, largely to aid the construction and minimisation
* of a TokenSet. As such it is not designed to be a human
* friendly representation of the TokenSet.
*
* @returns {string}
*/
lunr.TokenSet.prototype.toString = function () {
// NOTE: Using Object.keys here as this.edges is very likely
// to enter 'hash-mode' with many keys being added
//
// avoiding a for-in loop here as it leads to the function
// being de-optimised (at least in V8). From some simple
// benchmarks the performance is comparable, but allowing
// V8 to optimize may mean easy performance wins in the future.
if (this._str) {
return this._str
}
var str = this.final ? '1' : '0',
labels = Object.keys(this.edges).sort(),
len = labels.length
for (var i = 0; i < len; i++) {
var label = labels[i],
node = this.edges[label]
str = str + label + node.id
}
return str
}
/**
* Returns a new TokenSet that is the intersection of
* this TokenSet and the passed TokenSet.
*
* This intersection will take into account any wildcards
* contained within the TokenSet.
*
* @param {lunr.TokenSet} b - An other TokenSet to intersect with.
* @returns {lunr.TokenSet}
*/
lunr.TokenSet.prototype.intersect = function (b) {
var output = new lunr.TokenSet,
frame = undefined
var stack = [{
qNode: b,
output: output,
node: this
}]
while (stack.length) {
frame = stack.pop()
// NOTE: As with the #toString method, we are using
// Object.keys and a for loop instead of a for-in loop
// as both of these objects enter 'hash' mode, causing
// the function to be de-optimised in V8
var qEdges = Object.keys(frame.qNode.edges),
qLen = qEdges.length,
nEdges = Object.keys(frame.node.edges),
nLen = nEdges.length
for (var q = 0; q < qLen; q++) {
var qEdge = qEdges[q]
for (var n = 0; n < nLen; n++) {
var nEdge = nEdges[n]
if (nEdge == qEdge || qEdge == '*') {
var node = frame.node.edges[nEdge],
qNode = frame.qNode.edges[qEdge],
final = node.final && qNode.final,
next = undefined
if (nEdge in frame.output.edges) {
// an edge already exists for this character
// no need to create a new node, just set the finality
// bit unless this node is already final
next = frame.output.edges[nEdge]
next.final = next.final || final
} else {
// no edge exists yet, must create one
// set the finality bit and insert it
// into the output
next = new lunr.TokenSet
next.final = final
frame.output.edges[nEdge] = next
}
stack.push({
qNode: qNode,
output: next,
node: node
})
}
}
}
}
return output
}