examples

Contrasting examples of the Lucene API versus Pythonic Lupyne idioms.

indexers

"""
Basic indexing and searching example adapted from http://lucene.apache.org/java/3_1_0/api/core/index.html
"""

import lucene
lucene.initVM()
from lupyne import engine

### lucene ###

analyzer = lucene.StandardAnalyzer(lucene.Version.LUCENE_CURRENT)

# Store the index in memory:
directory = lucene.RAMDirectory()
# To store an index on disk, use this instead:
#directory = lucene.FSDirectory.open(lucene.File("/tmp/testindex"))
iwriter = lucene.IndexWriter(directory, analyzer, True, lucene.IndexWriter.MaxFieldLength(25000))
doc = lucene.Document()
text = "This is the text to be indexed."
doc.add(lucene.Field("fieldname", text, lucene.Field.Store.YES, lucene.Field.Index.ANALYZED))
iwriter.addDocument(doc)
iwriter.close()

# Now search the index:
isearcher = lucene.IndexSearcher(directory)
# Parse a simple query that searches for "text":
parser = lucene.QueryParser(lucene.Version.LUCENE_CURRENT, "fieldname", analyzer)
query = parser.parse("text")
hits = isearcher.search(query, None, 1000).scoreDocs
assert len(hits) == 1
# Iterate through the results:
for hit in hits:
    hitDoc = isearcher.doc(hit.doc)
    assert hitDoc['fieldname'] == text
isearcher.close()
directory.close()

### lupyne ###

# Store the index in memory:
indexer = engine.Indexer()              # Indexer combines Writer and Searcher; RAMDirectory and StandardAnalyzer are defaults
indexer.set('fieldname', store=True)    # settings for all documents of indexer; analyzed is the default
indexer.add(fieldname=text)             # add document
indexer.commit()                        # commit changes and refresh searcher

# Now search the index:
hits = indexer.search('text', field='fieldname')    # parsing handled if necessary
assert len(hits) == 1
for hit in hits:                                    # hits support mapping interface
    assert hit['fieldname'] == text
# closing is handled automatically
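
# To store the index on disk instead, point the indexer at a directory path
# (a sketch; assumes Indexer accepts a path argument, mirroring FSDirectory above):
#indexer = engine.Indexer('/tmp/testindex')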

queries

"""
Convenient Query creation.

Operator overloading is used for combining boolean clauses.
"""

import lucene
lucene.initVM()
from lupyne.engine import Query

### lucene ###

q1 = lucene.TermQuery(lucene.Term('text', 'lucene'))
q2 = lucene.PhraseQuery()
q2.add(lucene.Term('text', 'search'))
q2.add(lucene.Term('text', 'engine'))
q3 = lucene.BooleanQuery()
q3.add(q1, lucene.BooleanClause.Occur.MUST)
q3.add(q2, lucene.BooleanClause.Occur.MUST)
assert str(q3) == '+text:lucene +text:"search engine"'

q1 = lucene.SpanTermQuery(lucene.Term('text', 'hello'))
q2 = lucene.SpanFirstQuery(q1, 10)
q3 = lucene.SpanNotQuery(q1, q2)
assert str(q3) == 'spanNot(text:hello, spanFirst(text:hello, 10))'

### lupyne ###

q = Query.term('text', 'lucene') & Query.phrase('text', 'search', 'engine')
assert isinstance(q, lucene.BooleanQuery)
assert str(q) == '+text:lucene +text:"search engine"'

q = Query.span('text', 'hello')
q -= q[:10]
assert str(q) == 'spanNot(text:hello, spanFirst(text:hello, 10))'
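
# The remaining boolean operators follow the same pattern.  A minimal sketch,
# assuming | is overloaded for SHOULD clauses just as & is for MUST above:
q = Query.term('text', 'lucene') | Query.term('text', 'pylucene')
assert isinstance(q, lucene.BooleanQuery)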

searching

"""
Advanced searching with custom fields.

Prefix and Range queries are a potential pitfall in Lucene.
As the queries expand to more terms, the performance drops off precipitously.
A common example is where datetimes are indexed, but a large span of dates is being searched.
The usual workaround is to only index the amount of granularity needed, e.g., just the dates.
But this may not be sufficient, or the datetimes may be necessary for other searches.

The general solution is to index the term values into a prefix tree.
Then each query can expand to only values of the appropriate granularity.
Lucene's NumericFields encode numbers to be sortable, so they are also able to cluster prefixes into the same field.
Lupyne's NestedField, by contrast, assumes the value is already a sortable string, so different fields must be used to cluster the prefixes.
There are trade-offs to each approach:
 * NumericFields support range queries natively, but must translate prefix queries.
 * NestedFields support prefix queries optimally, but must translate range queries.
 * NumericFields only support numbers, and result in unreadable values in the index.
 * NestedFields support any searchable values, but pollute the field namespace.

Lupyne PointFields and DateTimeFields are now implemented as NumericFields since both are easily encoded as numbers.
NestedFields could still be used, however, as demonstrated on dates below.
"""

from datetime import date
import lucene
lucene.initVM()
from lupyne import engine

docs = [
    {'city': 'San Francisco', 'state': 'CA', 'incorporated': '1850-04-15', 'population': 808976, 'longitude': -122.4192, 'latitude': 37.7752},
    {'city': 'Los Angeles', 'state': 'CA', 'incorporated': '1850-04-04', 'population': 3849378, 'longitude': -118.2434, 'latitude': 34.0521},
    {'city': 'Portland', 'state': 'OR', 'incorporated': '1851-02-08', 'population': 575930, 'longitude': -122.6703, 'latitude': 45.5238},
]

indexer = engine.Indexer()
indexer.set('city', store=True, index=False)
indexer.set('state', store=True, index=False)
# set method supports custom field types inheriting their default settings
indexer.set('incorporated', engine.DateTimeField)
indexer.set('year-month-day', engine.NestedField, sep='-')
indexer.set('population', engine.NumericField)
indexer.set('point', engine.PointField, precision=10)
# assigned fields can have a different key from their underlying field name
indexer.fields['location'] = engine.NestedField('state.city')

for doc in docs:
    doc['year-month-day'] = doc['incorporated']
    point = doc.pop('longitude'), doc.pop('latitude')
    location = doc['state'] + '.' + doc['city']
    incorporated = map(int, doc.pop('incorporated').split('-'))
    indexer.add(doc, location=location, incorporated=date(*incorporated), point=[point])
indexer.commit()

query = indexer.fields['incorporated'].prefix([1850])
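# the prefix query expands to a numeric range; the encoded values are timestamps in seconds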
assert query.max.doubleValue() - query.min.doubleValue() == 60 * 60 * 24 * 365
assert [hit['city'] for hit in indexer.search(query)] == ['San Francisco', 'Los Angeles']
query = indexer.fields['incorporated'].range(date(1850, 4, 10), None)
assert query.max is None
assert [hit['city'] for hit in indexer.search(query)] == ['San Francisco', 'Portland']

query = indexer.fields['year-month-day'].prefix('1850')
assert str(query) == 'year:1850*'
assert [hit['city'] for hit in indexer.search(query)] == ['San Francisco', 'Los Angeles']
query = indexer.fields['year-month-day'].range('1850-04-10', None)
assert str(query) == 'year-month-day:[1850-04-10 TO *}'
assert [hit['city'] for hit in indexer.search(query)] == ['San Francisco', 'Portland']

query = indexer.fields['population'].range(0, 1000000)
assert str(query) == 'population:[0 TO 1000000}'
assert [hit['city'] for hit in indexer.search(query)] == ['San Francisco', 'Portland']

cities = ['San Francisco', 'Los Angeles', 'Portland']
for index, distance in enumerate([1e3, 1e5, 2e5, 1e6]):
    query = indexer.fields['point'].within(-122.4, 37.7, distance=distance)
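    # the search area is covered by at most a few (<= 4) clauses, regardless of distance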
    assert isinstance(query, lucene.BooleanQuery) and len(query) <= 4
    assert set(hit['city'] for hit in indexer.search(query)) == set(cities[:index])

query = indexer.fields['location'].prefix('CA.San')
# works like any prefix query
assert str(query) == 'state.city:CA.San*'
assert [hit['city'] for hit in indexer.search(query)] == ['San Francisco']
query = indexer.fields['location'].prefix('CA')
# optimized to search the best field
assert str(query) == 'state:CA*'
assert [hit['city'] for hit in indexer.search(query)] == ['San Francisco', 'Los Angeles']

sorting

"""
PyLucene has several pitfalls when collecting or sorting a large query result.
Generally they involve the overhead of crossing into the VM in an inner loop.

Lucene's performance itself drops off noticeably as the requested doc count increases.
This is heavily compounded by having to iterate through a large set of ScoreDocs in PyLucene.

Lucene also only supports sorting a query result when a doc count is supplied.
And supplying an excessively large count is not a good workaround because of the aforementioned problem.

Finally, the custom sorting interface, although well-supported in PyLucene, is basically useless.
The sort key of every potential doc must realistically be cached anyway,
but the performance overhead of O(n log n) comparison calls in Java is still horrid.

To mitigate all these problems, Lupyne first provides a unified search interface.
The same Hits type is returned regardless of whether a doc count is supplied.
As with Lucene, the result is fully evaluated, but each individual Hit object is only loaded on demand.
Internally an optimized custom hit Collector is used when all docs are requested.

The search method does allow Lucene Sort parameters to be passed through, since that's still optimal.
So the only gotcha is that with no doc count the sort parameter must instead be a Python callable key.
The IndexReader.comparator method is convenient for creating a sort key table from indexed fields.
The upshot is custom sorting and sorting large results are both easier and faster.

Custom sorting isn't necessary in the example below, of course; it's just there for demonstration.
"""

import lucene
lucene.initVM()
from lupyne import engine

colors = 'red', 'green', 'blue', 'cyan', 'magenta', 'yellow'
indexer = engine.Indexer()
indexer.set('color', store=True, index=True)
for color in colors:
    indexer.add(color=color)
indexer.commit()

### lucene ###

searcher = lucene.IndexSearcher(indexer.directory)
topdocs = searcher.search(lucene.MatchAllDocsQuery(), None, 10, lucene.Sort(lucene.SortField('color', lucene.SortField.STRING)))
assert [searcher.doc(scoredoc.doc)['color'] for scoredoc in topdocs.scoreDocs] == sorted(colors)

class ComparatorSource(lucene.PythonFieldComparatorSource):
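    # the nested class doubles as the newComparator factory method: instantiation creates a comparator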
    class newComparator(lucene.PythonFieldComparator):
        def __init__(self, name, numHits, sortPos, reversed):
            lucene.PythonFieldComparator.__init__(self)
            self.name = name
            self.values = [None] * numHits
        def setNextReader(self, reader, base):
            self.comparator = lucene.FieldCache.DEFAULT.getStrings(reader, self.name)
        def compare(self, slot1, slot2):
            return cmp(self.values[slot1], self.values[slot2])
        def setBottom(self, slot):
            self._bottom = self.values[slot]
        def compareBottom(self, doc):
            return cmp(self._bottom, self.comparator[doc])
        def copy(self, slot, doc):
            self.values[slot] = self.comparator[doc]
        def value(self, slot):
            return lucene.String()

sorter = lucene.Sort(lucene.SortField('color', ComparatorSource()))
# still must supply excessive doc count to use the sorter
topdocs = searcher.search(lucene.MatchAllDocsQuery(), None, 10, sorter)
assert [searcher.doc(scoredoc.doc)['color'] for scoredoc in topdocs.scoreDocs] == sorted(colors)

### lupyne ###

hits = indexer.search(count=10, sort='color')
assert [hit['color'] for hit in hits] == sorted(colors)
comparator = indexer.comparator('color')
assert list(comparator) == list(colors)
hits = indexer.search(sort=comparator.__getitem__)
assert [hit['color'] for hit in hits] == sorted(colors)

server

"""
Custom server.

Fields settings are assigned directly to the root.
Indexing is done here just to populate the example.

A custom filter and sorter are demonstrated by transforming a date field into a year field.
Filters are also used for faceting; sorters are also used for grouping.

Example queries:
 * http://localhost:8080/search?q=date:17*&group=year
 * http://localhost:8080/search?q=date:17*&group=year&sort=-year
 * http://localhost:8080/search?count=0&facets=year
 * http://localhost:8080/search?q=text:right&count=3&facets=year
"""

import os
import lucene
from lupyne import engine, server
from test import fixture

if __name__ == '__main__':
    lucene.initVM(vmargs='-Xrs')
    root = server.WebIndexer()
    # assign field settings
    root.indexer.set('amendment', store=True, index=True)
    root.indexer.set('date', store=True, index=True)
    root.indexer.set('text')
    # populate index
    for doc in fixture.constitution.docs():
        if 'amendment' in doc:
            root.indexer.add(doc)
    root.update()
    # assign custom filter and sorter based on year
    root.searcher.sorters['year'] = engine.SortField('date', int, lambda date: int(date.split('-')[0]))
    years = set(date.split('-')[0] for date in root.searcher.terms('date'))
    root.searcher.filters['year'] = dict((year, engine.Query.prefix('date', year).filter()) for year in years)
    # start with pretty-printing
    server.start(root, config={'global': {'tools.json.indent': 2}})
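
# Once running, the example queries above can be exercised with any HTTP client;
# responses are JSON, pretty-printed per the config.  A sketch:
#   import json, urllib
#   response = urllib.urlopen('http://localhost:8080/search?count=0&facets=year')
#   print json.loads(response.read())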
