/*!
 * lunr.Index
 * Copyright (C) @YEAR Oliver Nightingale
 */

/**
 * An index contains the built index of all documents and provides a query interface
 * to the index.
 *
 * Usually instances of lunr.Index will not be created using this constructor; instead
 * lunr.Builder should be used to construct new indexes, or lunr.Index.load should be
 * used to load previously built and serialized indexes.
 *
 * @constructor
 * @param {Object} attrs - The attributes of the built search index.
 * @param {Object} attrs.invertedIndex - An index of term/field to document reference.
 * @param {Object} attrs.fieldVectors - Field vectors.
 * @param {lunr.TokenSet} attrs.tokenSet - A set of all corpus tokens.
 * @param {string[]} attrs.fields - The names of indexed document fields.
 * @param {lunr.Pipeline} attrs.pipeline - The pipeline to use for search terms.
 */
lunr.Index = function (attrs) {
  this.invertedIndex = attrs.invertedIndex
  this.fieldVectors = attrs.fieldVectors
  this.tokenSet = attrs.tokenSet
  this.fields = attrs.fields
  this.pipeline = attrs.pipeline
}
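/*
 * Illustrative sketch (comment only, not executed): the two usual ways of
 * obtaining a lunr.Index are the lunr convenience builder (defined elsewhere
 * in the library) and lunr.Index.load. The document shape ('id', 'title',
 * 'body') and the serializedIndexJSON variable are assumptions for this
 * example, not requirements of the index.
 *
 *   var idx = lunr(function () {
 *     this.ref('id')
 *     this.field('title')
 *     this.field('body')
 *
 *     this.add({ id: '1', title: 'Hello', body: 'Hello world' })
 *   })
 *
 *   // or re-hydrate a previously built index, assuming serializedIndexJSON
 *   // holds JSON produced by serializing an index elsewhere
 *   var loadedIdx = lunr.Index.load(JSON.parse(serializedIndexJSON))
 */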
/**
 * A result contains details of a document matching a search query.
 * @typedef {Object} lunr.Index~Result
 * @property {string} ref - The reference of the document this result represents.
 * @property {number} score - A number between 0 and 1 representing how similar this document is to the query.
 * @property {lunr.MatchData} matchData - Contains metadata about this match including which term(s) caused the match.
 */

/**
 * Although lunr provides the ability to create queries using lunr.Query, it also provides a simple
 * query language which itself is parsed into an instance of lunr.Query.
 *
 * For programmatically building queries it is advised to use lunr.Query directly; the query language
 * is best used for human entered text rather than program generated text.
 *
 * At its simplest a query can be a single term, e.g. `hello`. Multiple terms are also supported
 * and will be combined with OR, e.g. `hello world` will match documents that contain either 'hello'
 * or 'world', though those that contain both will rank higher in the results.
 *
 * Wildcards can be included in terms to match one or more unspecified characters. These wildcards can
 * be inserted anywhere within the term, and more than one wildcard can exist in a single term. Adding
 * wildcards will increase the number of documents that will be found but can also have a negative
 * impact on query performance, especially with wildcards at the beginning of a term.
 *
 * Terms can be restricted to specific fields, e.g. `title:hello`; only documents with the term
 * hello in the title field will match this query. Using a field not present in the index will lead
 * to an error being thrown.
 *
 * Modifiers can also be added to terms; lunr supports edit distance and boost modifiers on terms. A term
 * boost will make documents matching that term score higher, e.g. `foo^5`. Edit distance is also supported
 * to provide fuzzy matching, e.g. `hello~2` will match documents containing 'hello' within an edit distance
 * of 2. Avoid large values for edit distance to improve query performance.
 *
 * Each term also supports a presence modifier. By default a term's presence in a document is optional, however
 * this can be changed to either required or prohibited. For a term's presence to be required in a document the
 * term should be prefixed with a '+', e.g. `+foo bar` is a search for documents that must contain 'foo' and
 * optionally contain 'bar'. Conversely a leading '-' sets the term's presence to prohibited, i.e. it must not
 * appear in a document, e.g. `-foo bar` is a search for documents that do not contain 'foo' but may contain 'bar'.
 *
 * To escape special characters the backslash character '\' can be used; this allows searches to include
 * characters that would normally be considered modifiers, e.g. `foo\~2` will search for a term "foo~2" instead
 * of attempting to apply a boost of 2 to the search term "foo".
 *
 * @typedef {string} lunr.Index~QueryString
 * @example Simple single term query
 * hello
 * @example Multiple term query
 * hello world
 * @example Term scoped to a field
 * title:hello
 * @example Term with a boost of 10
 * hello^10
 * @example Term with an edit distance of 2
 * hello~2
 * @example Terms with presence modifiers
 * -foo +bar baz
 */

/**
 * Performs a search against the index using lunr query syntax.
 *
 * Results will be returned sorted by their score, with the most relevant results
 * returned first. For details on how the score is calculated, please see
 * the {@link https://lunrjs.com/guides/searching.html#scoring|guide}.
 *
 * For more programmatic querying use lunr.Index#query.
 *
 * @param {lunr.Index~QueryString} queryString - A string containing a lunr query.
 * @throws {lunr.QueryParseError} If the passed query string cannot be parsed.
 * @returns {lunr.Index~Result[]}
 */
lunr.Index.prototype.search = function (queryString) {
  return this.query(function (query) {
    var parser = new lunr.QueryParser(queryString, query)
    parser.parse()
  })
}
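/*
 * Illustrative sketch (comment only, not executed): searching an index like the
 * one sketched earlier using the query language documented in
 * lunr.Index~QueryString. The idx variable, terms and presence modifier are
 * assumptions for this example only.
 *
 *   var results = idx.search('+hello world')
 *
 *   results.forEach(function (result) {
 *     // each result exposes ref, score and matchData, as described above
 *     console.log(result.ref, result.score)
 *   })
 */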
/**
 * A query builder callback provides a query object to be used to express
 * the query to perform on the index.
 *
 * @callback lunr.Index~queryBuilder
 * @param {lunr.Query} query - The query object to build up.
 * @this lunr.Query
 */

/**
 * Performs a query against the index using the yielded lunr.Query object.
 *
 * If performing programmatic queries against the index, this method is preferred
 * over lunr.Index#search so as to avoid the additional query parsing overhead.
 *
 * A query object is yielded to the supplied function which should be used to
 * express the query to be run against the index.
 *
 * Note that although this function takes a callback parameter it is _not_ an
 * asynchronous operation, the callback is just yielded a query object to be
 * customized.
 *
 * @param {lunr.Index~queryBuilder} fn - A function that is used to build the query.
 * @returns {lunr.Index~Result[]}
 */
lunr.Index.prototype.query = function (fn) {
  // for each query clause
  // * process terms
  // * expand terms from token set
  // * find matching documents and metadata
  // * get document vectors
  // * score documents

  var query = new lunr.Query(this.fields),
      matchingFields = Object.create(null),
      queryVectors = Object.create(null),
      termFieldCache = Object.create(null),
      requiredMatches = Object.create(null),
      prohibitedMatches = Object.create(null)

  /*
   * To support field level boosts a query vector is created per
   * field. An empty vector is eagerly created to support negated
   * queries.
   */
  for (var i = 0; i < this.fields.length; i++) {
    queryVectors[this.fields[i]] = new lunr.Vector
  }

  fn.call(query, query)

  for (var i = 0; i < query.clauses.length; i++) {
    /*
     * Unless the pipeline has been disabled for this term, which is
     * the case for terms with wildcards, we need to pass the clause
     * term through the search pipeline. A pipeline returns an array
     * of processed terms. Pipeline functions may expand the passed
     * term, which means we may end up performing multiple index lookups
     * for a single query term.
     */
    var clause = query.clauses[i],
        terms = null,
        clauseMatches = lunr.Set.empty

    if (clause.usePipeline) {
      terms = this.pipeline.runString(clause.term, {
        fields: clause.fields
      })
    } else {
      terms = [clause.term]
    }

    for (var m = 0; m < terms.length; m++) {
      var term = terms[m]

      /*
       * Each term returned from the pipeline needs to use the same query
       * clause object, e.g. the same boost and/or edit distance. The
       * simplest way to do this is to re-use the clause object but mutate
       * its term property.
       */
      clause.term = term

      /*
       * From the term in the clause we create a token set which will then
       * be used to intersect the index's token set to get a list of terms
       * to lookup in the inverted index.
       */
      var termTokenSet = lunr.TokenSet.fromClause(clause),
          expandedTerms = this.tokenSet.intersect(termTokenSet).toArray()

      /*
       * If a term marked as required does not exist in the tokenSet it is
       * impossible for the search to return any matches. We set all the
       * field-scoped required match sets to empty and stop examining any
       * further clauses.
       */
      if (expandedTerms.length === 0 && clause.presence === lunr.Query.presence.REQUIRED) {
        for (var k = 0; k < clause.fields.length; k++) {
          var field = clause.fields[k]
          requiredMatches[field] = lunr.Set.empty
        }

        break
      }

      for (var j = 0; j < expandedTerms.length; j++) {
        /*
         * For each term get the posting and termIndex, this is required for
         * building the query vector.
         */
        var expandedTerm = expandedTerms[j],
            posting = this.invertedIndex[expandedTerm],
            termIndex = posting._index

        for (var k = 0; k < clause.fields.length; k++) {
          /*
           * For each field that this query term is scoped by (by default
           * all fields are in scope) we need to get all the document refs
           * that have this term in that field.
           *
           * The posting is the entry in the invertedIndex for the matching
           * term from above.
           */
          var field = clause.fields[k],
              fieldPosting = posting[field],
              matchingDocumentRefs = Object.keys(fieldPosting),
              termField = expandedTerm + "/" + field,
              matchingDocumentsSet = new lunr.Set(matchingDocumentRefs)

          /*
           * If the presence of this term is required, ensure that the matching
           * documents are added to the set of required matches for this clause.
           */
          if (clause.presence == lunr.Query.presence.REQUIRED) {
            clauseMatches = clauseMatches.union(matchingDocumentsSet)

            if (requiredMatches[field] === undefined) {
              requiredMatches[field] = lunr.Set.complete
            }
          }

          /*
           * If the presence of this term is prohibited, ensure that the matching
           * documents are added to the set of prohibited matches for this field,
           * creating that set if it does not yet exist.
           */
          if (clause.presence == lunr.Query.presence.PROHIBITED) {
            if (prohibitedMatches[field] === undefined) {
              prohibitedMatches[field] = lunr.Set.empty
            }

            prohibitedMatches[field] = prohibitedMatches[field].union(matchingDocumentsSet)

            /*
             * Prohibited matches should not be part of the query vector used for
             * similarity scoring and no metadata should be extracted, so we continue
             * to the next field.
             */
            continue
          }

          /*
           * The query field vector is populated using the termIndex found for
           * the term and a unit value with the appropriate boost applied.
           * Using upsert because there could already be an entry in the vector
           * for the term we are working with. In that case we just add the scores
           * together.
           */
          queryVectors[field].upsert(termIndex, clause.boost, function (a, b) { return a + b })

          /**
           * If we've already seen this term/field combo then we've already collected
           * the matching documents and metadata, so there is no need to go through
           * all that again.
           */
          if (termFieldCache[termField]) {
            continue
          }

          for (var l = 0; l < matchingDocumentRefs.length; l++) {
            /*
             * All metadata for this term/field/document triple
             * are then extracted and collected into an instance
             * of lunr.MatchData ready to be returned in the query
             * results.
             */
            var matchingDocumentRef = matchingDocumentRefs[l],
                matchingFieldRef = new lunr.FieldRef (matchingDocumentRef, field),
                metadata = fieldPosting[matchingDocumentRef],
                fieldMatch

            if ((fieldMatch = matchingFields[matchingFieldRef]) === undefined) {
              matchingFields[matchingFieldRef] = new lunr.MatchData (expandedTerm, field, metadata)
            } else {
              fieldMatch.add(expandedTerm, field, metadata)
            }
          }

          termFieldCache[termField] = true
        }
      }
    }

    /**
     * If the presence was required we need to update the requiredMatches field sets.
     * We do this after all fields for the term have collected their matches because
     * the clause term's presence is required in _any_ of the fields, not _all_ of the
     * fields.
     */
    if (clause.presence === lunr.Query.presence.REQUIRED) {
      for (var k = 0; k < clause.fields.length; k++) {
        var field = clause.fields[k]
        requiredMatches[field] = requiredMatches[field].intersect(clauseMatches)
      }
    }
  }

  /**
   * Need to combine the field-scoped required and prohibited
   * matching documents into a global set of required and prohibited
   * matches.
   */
  var allRequiredMatches = lunr.Set.complete,
      allProhibitedMatches = lunr.Set.empty

  for (var i = 0; i < this.fields.length; i++) {
    var field = this.fields[i]

    if (requiredMatches[field]) {
      allRequiredMatches = allRequiredMatches.intersect(requiredMatches[field])
    }

    if (prohibitedMatches[field]) {
      allProhibitedMatches = allProhibitedMatches.union(prohibitedMatches[field])
    }
  }

  var matchingFieldRefs = Object.keys(matchingFields),
      results = [],
      matches = Object.create(null)

  /*
   * If the query is negated (contains only prohibited terms)
   * we need to get _all_ fieldRefs currently existing in the
   * index. This is only done when we know that the query is
   * entirely prohibited terms to avoid any cost of getting all
   * fieldRefs unnecessarily.
   *
   * Additionally, blank MatchData must be created to correctly
   * populate the results.
   */
  if (query.isNegated()) {
    matchingFieldRefs = Object.keys(this.fieldVectors)

    for (var i = 0; i < matchingFieldRefs.length; i++) {
      var matchingFieldRef = matchingFieldRefs[i]
      var fieldRef = lunr.FieldRef.fromString(matchingFieldRef)
      matchingFields[matchingFieldRef] = new lunr.MatchData
    }
  }

  for (var i = 0; i < matchingFieldRefs.length; i++) {
    /*
     * Currently we have document fields that match the query, but we
     * need to return documents. The matchData and scores are combined
     * from multiple fields belonging to the same document.
     *
     * Scores are calculated by field, using the query vectors created
     * above, and combined into a final document score using addition.
     */
    var fieldRef = lunr.FieldRef.fromString(matchingFieldRefs[i]),
        docRef = fieldRef.docRef

    if (!allRequiredMatches.contains(docRef)) {
      continue
    }

    if (allProhibitedMatches.contains(docRef)) {
      continue
    }

    var fieldVector = this.fieldVectors[fieldRef],
        score = queryVectors[fieldRef.fieldName].similarity(fieldVector),
        docMatch

    if ((docMatch = matches[docRef]) !== undefined) {
      docMatch.score += score
      docMatch.matchData.combine(matchingFields[fieldRef])
    } else {
      var match = {
        ref: docRef,
        score: score,
        matchData: matchingFields[fieldRef]
      }
      matches[docRef] = match
      results.push(match)
    }
  }

  /*
   * Sort the results objects by score, highest first.
   */
  return results.sort(function (a, b) {
    return b.score - a.score
  })
}
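/*
 * Illustrative sketch (comment only, not executed): the same kind of search
 * expressed programmatically through the query builder callback, avoiding the
 * string parsing overhead of lunr.Index#search. The idx variable, terms, boost
 * value and field name are assumptions for this example only.
 *
 *   var results = idx.query(function (q) {
 *     q.term('hello', { boost: 10, presence: lunr.Query.presence.REQUIRED })
 *     q.term('world', { fields: ['title'] })
 *   })
 */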
/**
 * Prepares the index for JSON serialization.
 *
 * The schema for this JSON blob will be described in a
 * separate JSON schema file.
 *
 * @returns {Object}
 */
lunr.Index.prototype.toJSON = function () {
  var invertedIndex = Object.keys(this.invertedIndex)
    .sort()
    .map(function (term) {
      return [term, this.invertedIndex[term]]
    }, this)

  var fieldVectors = Object.keys(this.fieldVectors)
    .map(function (ref) {
      return [ref, this.fieldVectors[ref].toJSON()]
    }, this)

  return {
    version: lunr.version,
    fields: this.fields,
    fieldVectors: fieldVectors,
    invertedIndex: invertedIndex,
    pipeline: this.pipeline.toJSON()
  }
}

/**
 * Loads a previously serialized lunr.Index
 *
 * @param {Object} serializedIndex - A previously serialized lunr.Index
 * @returns {lunr.Index}
 */
lunr.Index.load = function (serializedIndex) {
  var attrs = {},
      fieldVectors = {},
      serializedVectors = serializedIndex.fieldVectors,
      invertedIndex = Object.create(null),
      serializedInvertedIndex = serializedIndex.invertedIndex,
      tokenSetBuilder = new lunr.TokenSet.Builder,
      pipeline = lunr.Pipeline.load(serializedIndex.pipeline)

  if (serializedIndex.version != lunr.version) {
    lunr.utils.warn("Version mismatch when loading serialised index. Current version of lunr '" + lunr.version + "' does not match serialized index '" + serializedIndex.version + "'")
  }

  for (var i = 0; i < serializedVectors.length; i++) {
    var tuple = serializedVectors[i],
        ref = tuple[0],
        elements = tuple[1]

    fieldVectors[ref] = new lunr.Vector(elements)
  }

  for (var i = 0; i < serializedInvertedIndex.length; i++) {
    var tuple = serializedInvertedIndex[i],
        term = tuple[0],
        posting = tuple[1]

    tokenSetBuilder.insert(term)
    invertedIndex[term] = posting
  }

  tokenSetBuilder.finish()

  attrs.fields = serializedIndex.fields

  attrs.fieldVectors = fieldVectors
  attrs.invertedIndex = invertedIndex
  attrs.tokenSet = tokenSetBuilder.root
  attrs.pipeline = pipeline

  return new lunr.Index(attrs)
}
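/*
 * Illustrative round-trip sketch (comment only, not executed): JSON.stringify
 * picks up the toJSON method above, and the parsed result can be passed back
 * to lunr.Index.load. The idx variable is assumed to be an index built as in
 * the earlier examples.
 *
 *   var serialized = JSON.stringify(idx)
 *
 *   // later, possibly in a different process or in the browser
 *   var idx2 = lunr.Index.load(JSON.parse(serialized))
 *   idx2.search('hello')
 */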