Source code for lupyne.engine.documents

"""
Wrappers for lucene Fields and Documents.
"""

from future_builtins import map, zip
import datetime, calendar
import collections
import lucene
from .queries import Query

class Field(object):
    """Saved parameters which can generate lucene Fields given values.

    :param name: name of field
    :param store,index,termvector: field parameters, expressed as bools or strs, with lucene defaults
    :param analyzed,omitNorms: additional index boolean settings
    :param withPositions,withOffsets: additional termvector boolean settings
    :param attrs: additional attributes to set on the field
    """

    def __init__(self, name, store=False, index='analyzed', termvector=False, analyzed=False, omitNorms=False,
                 withPositions=False, withOffsets=False, **attrs):
        self.name = name
        self.attrs = attrs

        # A bool store flag is shorthand for the 'yes' / 'no' constant names.
        if isinstance(store, bool):
            store = 'yes' if store else 'no'
        self.store = lucene.Field.Store.valueOf(store.upper())

        # Bools are combined through lucene's helper; strs are looked up by constant name.
        index_enum = lucene.Field.Index
        if not isinstance(index, bool):
            self.index = index_enum.valueOf(index.upper())
        else:
            self.index = index_enum.toIndex(index, analyzed, omitNorms)

        vector_enum = lucene.Field.TermVector
        if not isinstance(termvector, bool):
            self.termvector = vector_enum.valueOf(termvector.upper())
        else:
            self.termvector = vector_enum.toTermVector(termvector, withOffsets, withPositions)

        # Validate settings by building one throwaway field.  Field.items is
        # called explicitly (not self.items), so a subclass override is bypassed.
        next(Field.items(self, ' '))

    def items(self, *values):
        """Generate lucene Fields suitable for adding to a document.

        Strings use the saved store, index and termvector settings, byte arrays
        are passed with the name only, and any other value is passed with the
        termvector setting.  The extra attrs are set on every generated field.
        """
        for content in values:
            if isinstance(content, basestring):
                args = (content, self.store, self.index, self.termvector)
            elif isinstance(content, lucene.JArray_byte):
                args = (content,)
            else:
                args = (content, self.termvector)
            field = lucene.Field(self.name, *args)
            for attr, setting in self.attrs.items():
                setattr(field, attr, setting)
            yield field
class FormatField(Field):
    """Field which uses string formatting on its values.

    :param format: format string
    """

    def __init__(self, name, format='{0}', **kwargs):
        Field.__init__(self, name, **kwargs)
        # Keep only the bound ``str.format`` method; it is applied to each value.
        self.format = format.format

    def items(self, *values):
        "Generate fields with formatted values."
        apply_format = self.format
        formatted = [apply_format(value) for value in values]
        return Field.items(self, *formatted)
class NestedField(Field):
    """Field which indexes every component into its own field.

    Original value may be stored for convenience.

    :param sep: field separator used on name and values
    """

    def __init__(self, name, sep='.', index=True, **kwargs):
        Field.__init__(self, name, index=index, **kwargs)
        self.sep = sep
        # One component field name per prefix of the separated name.
        self.names = tuple(self.values(name))

    def values(self, value):
        "Generate component field values in order."
        parts = value.split(self.sep)
        for count, _ in enumerate(parts, 1):
            # Each successive value holds one more leading component.
            yield self.sep.join(parts[:count])

    def items(self, *values):
        "Generate indexed component fields."
        new_field = lucene.Field
        if self.store.stored:
            # The full original values are stored but not indexed.
            for value in values:
                yield new_field(self.name, value, self.store, new_field.Index.NO)
        for value in values:
            position = 0
            for text in self.values(value):
                # Components are indexed, never stored, under the matching name.
                yield new_field(self.names[position], text, new_field.Store.NO, self.index, self.termvector)
                position += 1

    def prefix(self, value):
        "Return prefix query of the closest possible prefixed field."
        depth = value.count(self.sep)
        name = self.names[depth]
        return Query.prefix(name, value)

    def range(self, start, stop, lower=True, upper=False):
        "Return range query of the closest possible prefixed field."
        # The bound with the most separators selects the component field.
        depths = [bound.count(self.sep) for bound in (start, stop) if bound is not None]
        name = self.names[max(depths)]
        return Query.range(name, start, stop, lower, upper)
class NumericField(Field):
    """Field which indexes numbers in a prefix tree.

    :param name: name of field
    :param step: precision step
    """

    def __init__(self, name, step=None, store=False, index=True):
        Field.__init__(self, name, store)
        # A falsy step falls back to lucene's default precision step.
        self.step = step or lucene.NumericUtils.PRECISION_STEP_DEFAULT
        self.index = index

    def items(self, *values):
        "Generate lucene NumericFields suitable for adding to a document."
        for number in values:
            field = lucene.NumericField(self.name, self.step, self.store, self.index)
            if not isinstance(number, float):
                field.longValue = long(number)
            else:
                field.doubleValue = number
            yield field

    def numeric(self, cls, start, stop, lower, upper):
        """Return a numeric range built by `cls` for this field's name and step.

        A double range is built if either bound is a float, otherwise a long range.
        A long bound outside the lucene.Long limits is replaced with None.
        """
        if any(isinstance(bound, float) for bound in (start, stop)):
            low = None if start is None else lucene.Double(start)
            high = None if stop is None else lucene.Double(stop)
            return cls.newDoubleRange(self.name, self.step, low, high, lower, upper)
        if start is not None:
            if start < lucene.Long.MIN_VALUE:
                start = None
            else:
                start = lucene.Long(long(start))
        if stop is not None:
            if stop > lucene.Long.MAX_VALUE:
                stop = None
            else:
                stop = lucene.Long(long(stop))
        return cls.newLongRange(self.name, self.step, start, stop, lower, upper)

    def range(self, start, stop, lower=True, upper=False):
        "Return lucene NumericRangeQuery."
        query_cls = lucene.NumericRangeQuery
        return self.numeric(query_cls, start, stop, lower, upper)

    def term(self, value):
        "Return range query to match single term."
        # Both ends are the same value, so the upper bound must be inclusive.
        return self.range(value, value, upper=True)

    def filter(self, start, stop, lower=True, upper=False):
        "Return lucene NumericRangeFilter."
        filter_cls = lucene.NumericRangeFilter
        return self.numeric(filter_cls, start, stop, lower, upper)
class DateTimeField(NumericField):
    """Field which indexes datetimes as a NumericField of timestamps.

    Supports datetimes, dates, and any prefix of time tuples.
    """

    def timestamp(self, date):
        "Return utc timestamp from date or time tuple."
        if not isinstance(date, datetime.date):
            # Pad a partial time tuple: month and day default to 1, time to 00:00:00.
            padding = (None, 1, 1, 0, 0, 0)
            fields = tuple(date) + padding[len(date):]
            return float(calendar.timegm(fields))
        # NOTE(review): timetuple() applies no tzinfo offset, so aware datetimes
        # are presumably expected to already be in utc -- confirm with callers.
        seconds = calendar.timegm(date.timetuple())
        # Plain dates have no microsecond attribute, hence the default of 0.
        return seconds + getattr(date, 'microsecond', 0) * 1e-6

    def items(self, *dates):
        "Generate lucene NumericFields of timestamps."
        stamps = [self.timestamp(date) for date in dates]
        return NumericField.items(self, *stamps)

    def range(self, start, stop, lower=True, upper=False):
        "Return NumericRangeQuery of timestamps."
        # Falsy bounds (such as None) are passed through unconverted.
        if start:
            start = self.timestamp(start)
        if stop:
            stop = self.timestamp(stop)
        return NumericField.range(self, start, stop, lower, upper)

    def prefix(self, date):
        "Return range query which matches the date prefix."
        if isinstance(date, datetime.date):
            # Datetimes keep fields through seconds; plain dates keep year, month, day.
            width = 6 if isinstance(date, datetime.datetime) else 3
            date = date.timetuple()[:width]
        if len(date) == 2 and date[1] == 12:
            # month must be valid, so December rolls over to January of the next year
            stop = (date[0] + 1, 1)
        else:
            stop = tuple(date[:-1]) + (date[-1] + 1,)
        return self.range(date, stop)

    def duration(self, date, days=0, **delta):
        """Return date range query within time span of date.

        :param date: origin date or tuple
        :param days,delta: timedelta parameters
        """
        if not isinstance(date, datetime.date):
            padding = (None, 1, 1)
            date = datetime.datetime(*(tuple(date) + padding[len(date):]))
        span = datetime.timedelta(days, **delta)
        # Sort the two ends so that a negative span still yields start <= stop.
        bounds = [date, date + span]
        bounds.sort()
        start, stop = bounds
        return self.range(start, stop, upper=True)

    def within(self, days=0, weeks=0, utc=True, **delta):
        """Return date range query within current time and delta.

        If the delta is an exact number of days, then dates will be used.

        :param days,weeks: number of days to offset from today
        :param utc: optionally use utc instead of local time
        :param delta: additional timedelta parameters
        """
        current = datetime.datetime.utcnow if utc else datetime.datetime.now
        date = current()
        if not isinstance(days + weeks, float) and not delta:
            date = date.date()
        return self.duration(date, days, weeks=weeks, **delta)
class Document(dict):
    "Multimapping of field names to values, but default getters return the first value."

    def __init__(self, doc):
        for field in doc.getFields():
            # Every field name maps to the list of all of its values.
            values = self.setdefault(field.name(), [])
            if field.binary:
                values.append(field.binaryValue.string_)
            else:
                values.append(field.stringValue())

    def __getitem__(self, name):
        values = dict.__getitem__(self, name)
        return values[0]

    def get(self, name, default=None):
        # Wrap the default in a list so both cases are indexed the same way.
        fallback = [default]
        return dict.get(self, name, fallback)[0]

    def getlist(self, name):
        "Return list of all values for given field."
        return dict.get(self, name, [])

    def dict(self, *names, **defaults):
        """Return dict representation of document.

        :param names: names of multi-valued fields to return as a list
        :param defaults: include only given fields, using default values as necessary
        """
        # Without defaults every field of the document is included.
        selected = defaults or self
        firsts = [(name, self[name]) for name in selected if name in self]
        defaults.update(firsts)
        for name in names:
            defaults[name] = self.getlist(name)
        return defaults
class Hit(Document):
    "A Document with an id and score, from a search result."

    def __init__(self, doc, id, score):
        Document.__init__(self, doc)
        self.id = id
        self.score = score

    def dict(self, *names, **defaults):
        "Return dict representation of document with __id__ and __score__."
        result = Document.dict(self, *names, **defaults)
        result['__id__'] = self.id
        result['__score__'] = self.score
        return result
class Hits(object):
    """Search results: lazily evaluated and memory efficient.

    Provides a read-only sequence interface to hit objects.

    :param searcher: `IndexSearcher`_ which can retrieve documents
    :param ids: ordered doc ids
    :param scores: ordered doc scores
    :param count: total number of hits
    :param maxscore: maximum score
    :param fields: optional field selectors
    """

    def __init__(self, searcher, ids, scores, count=None, maxscore=None, fields=None):
        self.searcher = searcher
        self.ids = ids
        self.scores = scores
        self.count = count
        self.maxscore = maxscore
        # Iterables of field names are wrapped in a selector; anything else is kept as given.
        if isinstance(fields, collections.Iterable):
            fields = lucene.MapFieldSelector(fields)
        self.fields = fields

    def __len__(self):
        return len(self.ids)

    def __getitem__(self, index):
        ids, scores = self.ids[index], self.scores[index]
        if not isinstance(index, slice):
            # Single index: retrieve the document from the searcher.
            return Hit(self.searcher.doc(ids, self.fields), ids, scores)
        # Slice: same kind of results over the selected ids and scores.
        return type(self)(self.searcher, ids, scores, self.count, self.maxscore, self.fields)

    def items(self):
        "Generate zipped ids and scores."
        return zip(self.ids, self.scores)

    def groupby(self, func):
        "Return ordered list of `Hits`_ grouped by value of function applied to doc ids."
        groups, ordered = {}, []
        for doc_id, score in self.items():
            value = func(doc_id)
            group = groups.get(value)
            if group is None:
                group = groups[value] = type(self)(self.searcher, [], [], fields=self.fields)
                group.value = value
                # Groups are returned in the order their values were first seen.
                ordered.append(group)
            group.ids.append(doc_id)
            group.scores.append(score)
        return ordered

    def filter(self, func):
        "Return `Hits`_ filtered by function applied to doc ids."
        kept = [(doc_id, score) for doc_id, score in self.items() if func(doc_id)]
        ids = [doc_id for doc_id, _ in kept]
        scores = [score for _, score in kept]
        return type(self)(self.searcher, ids, scores, fields=self.fields)

    def sorted(self, key, reverse=False):
        "Return `Hits`_ sorted by key function applied to doc ids."
        ids = sorted(self.ids, key=key, reverse=reverse)
        # Look each score up by doc id so scores follow the new order.
        score_by_id = dict(self.items())
        scores = [score_by_id[doc_id] for doc_id in ids]
        return type(self)(self.searcher, ids, scores, self.count, self.maxscore, self.fields)