/*!
 * lunr.Builder
 * Copyright (C) @YEAR Oliver Nightingale
 */

/**
 * lunr.Builder performs indexing on a set of documents and
 * returns instances of lunr.Index ready for querying.
 *
 * All configuration of the index is done via the builder, the
 * fields to index, the document reference, the text processing
 * pipeline and document scoring parameters are all set on the
 * builder before indexing.
 *
 * @constructor
 * @property {string} _ref - Internal reference to the document reference field.
 * @property {object} _fields - Internal map of field name to the attributes (boost, extractor) supplied for that field.
 * @property {object} _documents - Internal map of document ref to the attributes (boost) supplied for that document.
 * @property {object} invertedIndex - The inverted index maps terms to document fields.
 * @property {object} fieldTermFrequencies - Keeps track of term frequencies for each document field, keyed by field ref.
 * @property {object} fieldLengths - Keeps track of the number of terms in each document field, keyed by field ref.
 * @property {lunr.tokenizer} tokenizer - Function for splitting strings into tokens for indexing.
 * @property {lunr.Pipeline} pipeline - The pipeline performs text processing on tokens before indexing.
 * @property {lunr.Pipeline} searchPipeline - A pipeline for processing search terms before querying the index.
 * @property {number} documentCount - Keeps track of the total number of documents indexed.
 * @property {number} _b - A parameter to control field length normalization, setting this to 0 disables normalization, 1 fully normalizes field lengths, the default value is 0.75.
 * @property {number} _k1 - A parameter to control how quickly an increase in term frequency results in term frequency saturation, the default value is 1.2.
 * @property {number} termIndex - A counter incremented for each unique term, used to identify a terms position in the vector space.
 * @property {array} metadataWhitelist - A list of metadata keys that have been whitelisted for entry in the index.
 */
lunr.Builder = function () {
  this._ref = "id"
  this._fields = Object.create(null)
  this._documents = Object.create(null)
  this.invertedIndex = Object.create(null)
  this.fieldTermFrequencies = {}
  this.fieldLengths = {}
  this.tokenizer = lunr.tokenizer
  this.pipeline = new lunr.Pipeline
  this.searchPipeline = new lunr.Pipeline
  this.documentCount = 0
  this._b = 0.75
  this._k1 = 1.2
  this.termIndex = 0
  this.metadataWhitelist = []
}

/**
 * Sets the document field used as the document reference. Every document must have this field.
 * The type of this field in the document should be a string, if it is not a string it will be
 * coerced into a string by calling toString.
 *
 * The default ref is 'id'.
 *
 * The ref should _not_ be changed during indexing, it should be set before any documents are
 * added to the index. Changing it during indexing can lead to inconsistent results.
 *
 * @param {string} ref - The name of the reference field in the document.
 */
lunr.Builder.prototype.ref = function (ref) {
  this._ref = ref
}

/**
 * A function that is used to extract a field from a document.
 *
 * Lunr expects a field to be at the top level of a document, if however the field
 * is deeply nested within a document an extractor function can be used to extract
 * the right field for indexing.
 *
 * @callback fieldExtractor
 * @param {object} doc - The document being added to the index.
 * @returns {?(string|object|object[])} obj - The object that will be indexed for this field.
 * @example <caption>Extracting a nested field</caption>
 * function (doc) { return doc.nested.field }
 */

/**
 * Adds a field to the list of document fields that will be indexed. Every document being
 * indexed should have this field. Null values for this field in indexed documents will
 * not cause errors but will limit the chance of that document being retrieved by searches.
 *
 * All fields should be added before adding documents to the index. Adding fields after
 * a document has been indexed will have no effect on already indexed documents.
 *
 * Fields can be boosted at build time. This allows terms within that field to have more
 * importance when ranking search results. Use a field boost to specify that matches within
 * one field are more important than other fields.
 *
 * @param {string} fieldName - The name of a field to index in all documents.
 * @param {object} attributes - Optional attributes associated with this field.
 * @param {number} [attributes.boost=1] - Boost applied to all terms within this field.
 * @param {fieldExtractor} [attributes.extractor] - Function to extract a field from a document.
 * @throws {RangeError} fieldName cannot contain unsupported characters '/'
 */
lunr.Builder.prototype.field = function (fieldName, attributes) {
  // '/' separates the field name from the document ref in a field ref string,
  // so it cannot be allowed inside a field name.
  if (/\//.test(fieldName)) {
    throw new RangeError ("Field '" + fieldName + "' contains illegal character '/'")
  }

  this._fields[fieldName] = attributes || {}
}

/**
 * A parameter to tune the amount of field length normalisation that is applied when
 * calculating relevance scores. A value of 0 will completely disable any normalisation
 * and a value of 1 will fully normalise field lengths. The default is 0.75. Values of b
 * will be clamped to the range 0 - 1.
 *
 * @param {number} number - The value to set for this tuning parameter.
 */
lunr.Builder.prototype.b = function (number) {
  if (number < 0) {
    this._b = 0
  } else if (number > 1) {
    this._b = 1
  } else {
    this._b = number
  }
}

/**
 * A parameter that controls the speed at which a rise in term frequency results in term
 * frequency saturation. The default value is 1.2. Setting this to a higher value will give
 * slower saturation levels, a lower value will result in quicker saturation.
 *
 * @param {number} number - The value to set for this tuning parameter.
 */
lunr.Builder.prototype.k1 = function (number) {
  this._k1 = number
}

/**
 * Adds a document to the index.
 *
 * Before adding documents to the index the index should have been fully setup, with the document
 * ref and all fields to index already having been specified.
 *
 * The document must have a field name as specified by the ref (by default this is 'id') and
 * it should have all fields defined for indexing, though null or undefined values will not
 * cause errors.
 *
 * Entire documents can be boosted at build time. Applying a boost to a document indicates that
 * this document should rank higher in search results than other documents.
 *
 * @param {object} doc - The document to add to the index.
 * @param {object} attributes - Optional attributes associated with this document.
 * @param {number} [attributes.boost=1] - Boost applied to all terms within this document.
 */
lunr.Builder.prototype.add = function (doc, attributes) {
  var docRef = doc[this._ref],
      fields = Object.keys(this._fields)

  this._documents[docRef] = attributes || {}
  this.documentCount += 1

  for (var i = 0; i < fields.length; i++) {
    var fieldName = fields[i],
        extractor = this._fields[fieldName].extractor,
        field = extractor ? extractor(doc) : doc[fieldName],
        tokens = this.tokenizer(field, {
          fields: [fieldName]
        }),
        terms = this.pipeline.run(tokens),
        fieldRef = new lunr.FieldRef (docRef, fieldName),
        fieldTerms = Object.create(null)

    this.fieldTermFrequencies[fieldRef] = fieldTerms

    // store the length of this field for this document
    this.fieldLengths[fieldRef] = terms.length

    // calculate term frequencies for this field
    for (var j = 0; j < terms.length; j++) {
      var term = terms[j]

      if (fieldTerms[term] === undefined) {
        fieldTerms[term] = 0
      }

      fieldTerms[term] += 1

      // add to inverted index
      // create an initial posting if one doesn't exist
      var posting = this.invertedIndex[term]

      if (posting === undefined) {
        posting = Object.create(null)
        posting["_index"] = this.termIndex
        this.termIndex += 1

        // every posting carries an entry for every known field
        for (var k = 0; k < fields.length; k++) {
          posting[fields[k]] = Object.create(null)
        }

        this.invertedIndex[term] = posting
      }

      // add an entry for this term/fieldName/docRef to the invertedIndex
      var fieldPosting = posting[fieldName],
          docPosting = fieldPosting[docRef]

      if (docPosting === undefined) {
        docPosting = Object.create(null)
        fieldPosting[docRef] = docPosting
      }

      // store all whitelisted metadata about this token in the
      // inverted index
      for (var l = 0; l < this.metadataWhitelist.length; l++) {
        var metadataKey = this.metadataWhitelist[l],
            metadata = term.metadata[metadataKey]

        if (docPosting[metadataKey] === undefined) {
          docPosting[metadataKey] = []
        }

        docPosting[metadataKey].push(metadata)
      }
    }
  }
}

/**
 * Calculates the average length (number of terms) of each field across
 * the documents that have been added, storing the result, keyed by field
 * name, in `averageFieldLength`.
 *
 * @private
 */
lunr.Builder.prototype.calculateAverageFieldLengths = function () {
  // These maps are keyed by user supplied field names, so they must not
  // inherit from Object.prototype: a field called 'constructor' or
  // 'toString' would otherwise pick up the inherited member instead of
  // starting from zero, producing NaN averages.
  var fieldRefs = Object.keys(this.fieldLengths),
      numberOfFieldRefs = fieldRefs.length,
      accumulator = Object.create(null),
      documentsWithField = Object.create(null)

  for (var i = 0; i < numberOfFieldRefs; i++) {
    var fieldRefString = fieldRefs[i],
        field = lunr.FieldRef.fromString(fieldRefString).fieldName

    if (documentsWithField[field] === undefined) {
      documentsWithField[field] = 0
      accumulator[field] = 0
    }

    documentsWithField[field] += 1
    accumulator[field] += this.fieldLengths[fieldRefString]
  }

  var fields = Object.keys(this._fields)

  for (var j = 0; j < fields.length; j++) {
    var fieldName = fields[j]
    accumulator[fieldName] = accumulator[fieldName] / documentsWithField[fieldName]
  }

  this.averageFieldLength = accumulator
}

/**
 * Builds a vector space model of every document field using lunr.Vector.
 *
 * Each term in a field is scored from its idf, its frequency within the
 * field, the field length relative to the average length of that field and
 * the `_k1` and `_b` tuning parameters, then multiplied by the field and
 * document boosts.
 *
 * @private
 */
lunr.Builder.prototype.createFieldVectors = function () {
  var fieldVectors = {},
      fieldRefs = Object.keys(this.fieldTermFrequencies),
      fieldRefsLength = fieldRefs.length,
      termIdfCache = Object.create(null)

  for (var i = 0; i < fieldRefsLength; i++) {
    var fieldRefString = fieldRefs[i],
        fieldRef = lunr.FieldRef.fromString(fieldRefString),
        fieldName = fieldRef.fieldName,
        fieldLength = this.fieldLengths[fieldRefString],
        fieldVector = new lunr.Vector,
        termFrequencies = this.fieldTermFrequencies[fieldRefString],
        terms = Object.keys(termFrequencies),
        termsLength = terms.length

    var fieldBoost = this._fields[fieldName].boost || 1,
        docBoost = this._documents[fieldRef.docRef].boost || 1

    for (var j = 0; j < termsLength; j++) {
      var term = terms[j],
          tf = termFrequencies[term],
          termIndex = this.invertedIndex[term]._index,
          idf, score, scoreWithPrecision

      // the idf of a term is the same for every field, so only compute it once
      if (termIdfCache[term] === undefined) {
        idf = lunr.idf(this.invertedIndex[term], this.documentCount)
        termIdfCache[term] = idf
      } else {
        idf = termIdfCache[term]
      }

      score = idf * ((this._k1 + 1) * tf) / (this._k1 * (1 - this._b + this._b * (fieldLength / this.averageFieldLength[fieldName])) + tf)
      score *= fieldBoost
      score *= docBoost

      // Rounds to three decimal places, e.g. 1.23456789 becomes 1.235.
      // Reducing the precision so that the vectors take up less
      // space when serialised. Doing it now so that they behave
      // the same before and after serialisation. Also, this is
      // the fastest approach to reducing a number's precision in
      // JavaScript.
      scoreWithPrecision = Math.round(score * 1000) / 1000

      fieldVector.insert(termIndex, scoreWithPrecision)
    }

    fieldVectors[fieldRefString] = fieldVector
  }

  this.fieldVectors = fieldVectors
}

/**
 * Creates a token set of all tokens in the index using lunr.TokenSet
 *
 * @private
 */
lunr.Builder.prototype.createTokenSet = function () {
  this.tokenSet = lunr.TokenSet.fromArray(
    Object.keys(this.invertedIndex).sort()
  )
}

/**
 * Builds the index, creating an instance of lunr.Index.
 *
 * This completes the indexing process and should only be called
 * once all documents have been added to the index.
 *
 * @returns {lunr.Index}
 */
lunr.Builder.prototype.build = function () {
  // order matters: the field vectors depend on the average field lengths
  this.calculateAverageFieldLengths()
  this.createFieldVectors()
  this.createTokenSet()

  return new lunr.Index({
    invertedIndex: this.invertedIndex,
    fieldVectors: this.fieldVectors,
    tokenSet: this.tokenSet,
    fields: Object.keys(this._fields),
    pipeline: this.searchPipeline
  })
}

/**
 * Applies a plugin to the index builder.
 *
 * A plugin is a function that is called with the index builder as its context.
 * Plugins can be used to customise or extend the behaviour of the index
 * in some way. A plugin is just a function, that encapsulates the custom
 * behaviour that should be applied when building the index.
 *
 * The plugin function will be called with the index builder as its argument, additional
 * arguments can also be passed when calling use. The function will be called
 * with the index builder as its context.
 *
 * @param {Function} fn - The plugin to apply.
 */
lunr.Builder.prototype.use = function (fn) {
  var args = Array.prototype.slice.call(arguments, 1)
  args.unshift(this)
  fn.apply(this, args)
}