examples

Contrasting examples of the lucene api versus pythonic lupyne idioms.

indexers

"""
Basic indexing and searching example adapted from http://lucene.apache.org/java/2_9_1/api/core/index.html
Compatible with lucene versions 2.4 through 3.0.
"""

import lucene
lucene.initVM(lucene.CLASSPATH)
from lupyne import engine

### lucene ###

# When this lucene build exposes Version, the analyzer and parser constructors
# take it as a leading argument; otherwise they take no version at all.
version_args = (lucene.Version.LUCENE_CURRENT,) if hasattr(lucene, 'Version') else ()
analyzer = lucene.StandardAnalyzer(*version_args)

# Keep the index in memory:
directory = lucene.RAMDirectory()
# To keep the index on disk, use this instead:
#directory = lucene.FSDirectory.open(lucene.File("/tmp/testindex"))
max_length = lucene.IndexWriter.MaxFieldLength(25000)
iwriter = lucene.IndexWriter(directory, analyzer, True, max_length)
text = "This is the text to be indexed."
field = lucene.Field("fieldname", text, lucene.Field.Store.YES, lucene.Field.Index.ANALYZED)
doc = lucene.Document()
doc.add(field)
iwriter.addDocument(doc)
iwriter.close()

# Search the index that was just written:
isearcher = lucene.IndexSearcher(directory)
# Build a parser for "fieldname" and parse a simple query that searches for "text":
parser = lucene.QueryParser(*(version_args + ("fieldname", analyzer)))
query = parser.parse("text")
hits = isearcher.search(query, None, 1000).scoreDocs
assert len(hits) == 1
# Fetch the stored document of every hit and check its field value:
for hit in hits:
    stored = isearcher.doc(hit.doc)
    assert stored['fieldname'] == text
isearcher.close()
directory.close()

### lupyne ###

# Store the index in memory:
# Indexer combines Writer and Searcher; RAMDirectory and StandardAnalyzer are defaults
indexer = engine.Indexer()
# settings for all documents of indexer; tokenized is the default
indexer.set('fieldname', store=True)
# add document, then commit changes and refresh searcher
indexer.add(fieldname=text)
indexer.commit()

# Now search the index; parsing handled if necessary
hits = indexer.search('text', field='fieldname')
assert len(hits) == 1
# hits support mapping interface
assert all(hit['fieldname'] == text for hit in hits)
# closing is handled automatically

numeric

"""
Numeric fields.

Lupyne's PrefixField was created before Lucene's NumericField was added in version 2.9.
Both support indexing a prefix tree of values, in order to optimize range and prefix queries, but use different approaches.

NumericFields encode numbers to be sortable, so they are also able to cluster prefixes into the same field.
Whereas PrefixField assumes the value is already a sortable string, so different fields must be used to cluster the prefixes.
There are trade-offs to each approach:
 * NumericFields support range queries natively, but must translate prefix queries.
 * PrefixFields support prefix queries optimally, but must translate range queries.
 * NumericFields only support numbers, and result in unreadable values in the index.
 * PrefixFields support any searchable values, but pollute the field namespace.

Spatial and datetime fields are two common examples that need prefix tree support.
Currently SpatialFields and DateTimeFields are based on PrefixFields, but have an alternate NumericField implementation.
Because both are easily encodable as numbers, the plan is to make the numeric implementation the default when support for 2.4 is dropped.

So the long term support for PrefixField is unclear, although sometimes it is convenient to index into different fields.
For example, breaking datetimes into their components makes searching by year optimal, and it's easier to introspect the index.
There will be continued support for NestedFields, which allow arbitrary compound indexes similar to a relational database.
See the state:city field in the searching example.
"""

from datetime import date, datetime
import lucene
lucene.initVM(lucene.CLASSPATH)
from lupyne import engine

# Sample documents; the indexing loop below pops the derived keys out of each dict.
docs = [
    {'city': 'San Francisco', 'incorporated': '1850-04-15', 'population': 808976, 'longitude': -122.4192, 'latitude': 37.7752},
    {'city': 'Los Angeles', 'incorporated': '1850-04-04', 'population': 3849378, 'longitude': -118.2434, 'latitude': 34.0521},
    {'city': 'Portland', 'incorporated': '1851-02-08', 'population': 575930, 'longitude': -122.6703, 'latitude': 45.5238},
]

indexer = engine.Indexer()
# city is only stored; the other fields are registered with numeric field types
indexer.set('city', store=True, index=False)
indexer.set('incorporated', engine.numeric.DateTimeField)
indexer.set('population', engine.numeric.NumericField)
indexer.set('point', engine.numeric.PointField, precision=10)

for doc in docs:
    # the ISO date string becomes a date, and the coordinates a (longitude, latitude) pair
    year, month, day = [int(part) for part in doc.pop('incorporated').split('-')]
    coordinates = doc.pop('longitude'), doc.pop('latitude')
    indexer.add(doc, incorporated=date(year, month, day), point=[coordinates])
indexer.commit()

def matching_cities(query):
    """Return the stored city names of the hits for ``query``, in result order."""
    return [hit['city'] for hit in indexer.search(query)]


incorporated_field = indexer.fields['incorporated']
# a prefix of one year yields a numeric range whose bounds differ by 365 days of seconds
query = incorporated_field.prefix([1850])
seconds_per_year = 60 * 60 * 24 * 365
assert query.max.doubleValue() - query.min.doubleValue() == seconds_per_year
assert matching_cities(query) == ['San Francisco', 'Los Angeles']
# an open-ended range has no upper bound
query = incorporated_field.range(date(1850, 4, 10), None)
assert query.max is None
assert matching_cities(query) == ['San Francisco', 'Portland']

query = indexer.fields['population'].range(0, 1000000)
assert str(query) == 'population:[0 TO 1000000}'
assert matching_cities(query) == ['San Francisco', 'Portland']

# each successive distance is expected to match one more of these cities
cities = ['San Francisco', 'Los Angeles', 'Portland']
distances = [1e3, 1e5, 2e5, 1e6]
for count, distance in enumerate(distances):
    query = indexer.fields['point'].within(-122.4, 37.7, distance=distance)
    assert isinstance(query, lucene.BooleanQuery)
    assert len(query) <= 4
    assert set(matching_cities(query)) == set(cities[:count])

queries

"""
Convenient Query creation.

Operator overloading is used for combining boolean clauses.
"""

import lucene
lucene.initVM(lucene.CLASSPATH)
from lupyne.engine import Query

### lucene ###

# A term query and a phrase query, both added as required clauses of a boolean query.
term_query = lucene.TermQuery(lucene.Term('text', 'lucene'))
phrase_query = lucene.PhraseQuery()
for word in ('search', 'engine'):
    phrase_query.add(lucene.Term('text', word))
boolean_query = lucene.BooleanQuery()
for clause in (term_query, phrase_query):
    boolean_query.add(clause, lucene.BooleanClause.Occur.MUST)
assert str(boolean_query) == '+text:lucene +text:"search engine"'

# A span term query, minus the spans where it occurs within the first 10 positions.
span_term = lucene.SpanTermQuery(lucene.Term('text', 'hello'))
span_first = lucene.SpanFirstQuery(span_term, 10)
span_not = lucene.SpanNotQuery(span_term, span_first)
assert str(span_not) == 'spanNot(text:hello, spanFirst(text:hello, 10))'

### lupyne ###

# `&` between a term query and a phrase query produces a lucene BooleanQuery
# with both clauses required, as the asserted string shows.
q = Query.term('text', 'lucene') & Query.phrase('text', 'search', 'engine')
assert isinstance(q, lucene.BooleanQuery)
assert str(q) == '+text:lucene +text:"search engine"'

q = Query.span('text', 'hello')
# slicing with [:10] gives the spanFirst query and `-=` wraps both in a spanNot,
# matching the string asserted on the next line
q -= q[:10]
assert str(q) == 'spanNot(text:hello, spanFirst(text:hello, 10))'

searching

"""
Advanced searching with custom fields.

Prefix and Range queries are a potential pitfall in Lucene.
As the queries expand to more terms, the performance drops off precipitously.

A common example is where datetimes are indexed, but a large span of date ranges is being searched.
The usual workaround is to only index the amount of granularity needed, e.g., just the dates.
But this may not be sufficient, or the datetimes may be necessary for other searches.

In any case the principle can be generalized to indexing every prefix of a term.
The cost in indexing time and space is well worth the optimal search times.

Lupyne's PrefixField automates the indexing of such prefix trees into different fields.
The default naming convention makes each field look like a python slice of the original field.
The fields also provide prefix and range query generators that optimally utilize the underlying fields.

NestedFields extend PrefixFields to support a common separator.
DateTimeFields extend PrefixFields with datetime specific query generators.
PointFields extend PrefixFields to support geospatial queries.
"""

import lucene
lucene.initVM(lucene.CLASSPATH)
from lupyne import engine

# Sample documents; populations are zero-padded strings so that they sort lexically.
docs = [
    {'city': 'San Francisco', 'state': 'CA', 'incorporated': '1850-04-15', 'population': '0,808,976', 'longitude': -122.4192, 'latitude': 37.7752},
    {'city': 'Los Angeles', 'state': 'CA', 'incorporated': '1850-04-04', 'population': '3,849,378', 'longitude': -118.2434, 'latitude': 34.0521},
    {'city': 'Portland', 'state': 'OR', 'incorporated': '1851-02-08', 'population': '0,575,930', 'longitude': -122.6703, 'latitude': 45.5238},
]

indexer = engine.Indexer()
# city and state are stored for retrieval but not indexed themselves
for name in ('city', 'state'):
    indexer.set(name, store=True, index=False)
# set method supports custom field types inheriting their default settings
indexer.set('incorporated', engine.DateTimeField)
indexer.set('population', engine.PrefixField)
indexer.set('point', engine.PointField, precision=10)
# assigned fields can have a different key from their underlying field name
indexer.fields['location'] = engine.NestedField('state:city')

for doc in docs:
    # coordinates are removed from the doc and indexed as a single (longitude, latitude) point
    coordinates = doc.pop('longitude'), doc.pop('latitude')
    location = '%s:%s' % (doc['state'], doc['city'])
    indexer.add(doc, location=location, point=[coordinates])
indexer.commit()

def matching_cities(query):
    """Return the stored city names of the hits for ``query``, in result order."""
    return [hit['city'] for hit in indexer.search(query)]


# automatically handles date arithmetic to compute an optimal boolean (OR) query
query = indexer.fields['incorporated'].range('1800', '1851-02-07')
expected = 'incorporated:Y:[1800 TO 1851} incorporated:Ym:[1851 TO 1851-02} incorporated:Ymd:[1851-02 TO 1851-02-07}'
assert str(query) == expected
assert matching_cities(query) == ['San Francisco', 'Los Angeles']

population_field = indexer.fields['population']
# works like any range query
query = population_field.range('0', '1,000,000')
assert str(query) == 'population[:9]:[0 TO 1,000,000}'
assert matching_cities(query) == ['San Francisco', 'Portland']
# optimized to search the best field
query = population_field.range('0', '1')
assert str(query) == 'population[:1]:[0 TO 1}'
assert matching_cities(query) == ['San Francisco', 'Portland']

location_field = indexer.fields['location']
# works like any prefix query
query = location_field.prefix('CA:San')
assert str(query) == 'state:city:CA:San*'
assert matching_cities(query) == ['San Francisco']
# optimized to search the best field
query = location_field.prefix('CA')
assert str(query) == 'state:CA*'
assert matching_cities(query) == ['San Francisco', 'Los Angeles']

# each successive distance is expected to match one more of these cities
cities = ['San Francisco', 'Los Angeles', 'Portland']
distances = [1e3, 1e5, 2e5, 1e6]
for count, distance in enumerate(distances):
    query = indexer.fields['point'].within(-122.4, 37.7, distance=distance)
    assert isinstance(query, lucene.BooleanQuery)
    assert len(query) <= 4
    assert set(matching_cities(query)) == set(cities[:count])

sorting

"""
PyLucene has several pitfalls when collecting or sorting a large query result.
Generally they involve the overhead of traversing the VM in an internal loop.

Lucene's performance itself drops off noticeably as the requested doc count increases.
This is heavily compounded by having to iterate through a large set of ScoreDocs in PyLucene.

Lucene also only supports sorting a query result when a doc count is supplied.
And supplying an excessively large count is not a good workaround because of the aforementioned problem.

Finally the custom sorting interface, although well-supported in PyLucene, is basically useless.
The sort key of every potential doc must realistically be cached anyway,
but the performance overhead of O(n log n) comparison calls in java is still horrid.

To mitigate all these problems, Lupyne first provides a unified search interface.
The same Hits type is returned regardless of whether a doc count is supplied.
As with lucene, the result is fully evaluated but each individual Hit object will only be loaded on demand.
Internally an optimized custom hit Collector is used when all docs are requested.

The search method does allow lucene Sort parameters to be passed through, since that's still optimal.
So the only gotcha is that with no doc count the sort parameter must instead be a python callable key.
The IndexReader.comparator method is convenient for creating a sort key table from indexed fields.
The upshot is custom sorting and sorting large results are both easier and faster.

Custom sorting isn't necessary in the below example of course, just there for demonstration.
Compatible with lucene versions 2.4 through 3.0.
"""

import lucene
lucene.initVM(lucene.CLASSPATH)
from lupyne import engine

# Index one stored, indexed document per color, in this (unsorted) order.
colors = ('red', 'green', 'blue', 'cyan', 'magenta', 'yellow')
indexer = engine.Indexer()
indexer.set('color', store=True, index=True)
for name in colors:
    indexer.add(color=name)
indexer.commit()

### lucene ###

# Search the indexer's directory directly, sorting on the color field as a string.
searcher = lucene.IndexSearcher(indexer.directory)
by_color = lucene.Sort(lucene.SortField('color', lucene.SortField.STRING))
topdocs = searcher.search(lucene.MatchAllDocsQuery(), None, 10, by_color)
found = [searcher.doc(scoredoc.doc)['color'] for scoredoc in topdocs.scoreDocs]
assert found == sorted(colors)

# Custom comparator source for sorting on a string field.  Which of the two
# variants is defined depends on whether this lucene build exposes
# PythonFieldComparatorSource.
if hasattr(lucene, 'PythonFieldComparatorSource'):
    class ComparatorSource(lucene.PythonFieldComparatorSource):
        # Declared as a nested class named newComparator, so that calling
        # newComparator(...) on the source constructs a comparator instance.
        class newComparator(lucene.PythonFieldComparator):
            def __init__(self, name, numHits, sortPos, reversed):
                # sortPos and reversed are accepted but not used here
                lucene.PythonFieldComparator.__init__(self)
                self.name = name
                # one field value per slot, filled in by copy()
                self.values = [None] * numHits
            def setNextReader(self, reader, base):
                # string values of the sort field for this reader, indexed by doc id
                self.comparator = lucene.FieldCache.DEFAULT.getStrings(reader, self.name)
            def compare(self, slot1, slot2):
                # three-way comparison (Python 2 cmp) of two stored slot values
                return cmp(self.values[slot1], self.values[slot2])
            def setBottom(self, slot):
                # remember the value of the given slot for compareBottom()
                self._bottom = self.values[slot]
            def compareBottom(self, doc):
                # compare the remembered bottom value against a doc's field value
                return cmp(self._bottom, self.comparator[doc])
            def copy(self, slot, doc):
                # store the doc's field value into the given slot
                self.values[slot] = self.comparator[doc]
            def value(self, slot):
                # always returns an empty lucene.String, not the slot's value
                # NOTE(review): presumably a placeholder because the example never reads it - confirm
                return lucene.String()
else:
    class ComparatorSource(lucene.PythonSortComparatorSource):
        # Nested class named newComparator, so calling it constructs the comparator.
        class newComparator(lucene.PythonScoreDocComparator):
            def __init__(self, reader, name):
                lucene.PythonScoreDocComparator.__init__(self)
                # string values of the sort field for this reader, indexed by doc id
                self.comparator = lucene.FieldCache.DEFAULT.getStrings(reader, name)
            def compare(self, i, j):
                # three-way comparison (Python 2 cmp) of the field values of two score docs
                return cmp(self.comparator[i.doc], self.comparator[j.doc])
            def sortValue(self, i):
                # always returns an empty lucene.String, not the doc's value
                # NOTE(review): presumably a placeholder because the example never reads it - confirm
                return lucene.String()
            def sortType(self):
                return lucene.SortField.STRING

# Sort on the color field through the custom comparator source.
sorter = lucene.Sort(lucene.SortField('color', ComparatorSource()))
# still must supply excessive doc count to use the sorter
topdocs = searcher.search(lucene.MatchAllDocsQuery(), None, 10, sorter)
assert [searcher.doc(scoredoc.doc)['color'] for scoredoc in topdocs.scoreDocs] == sorted(colors)
# This is the last use of the raw lucene searcher: close it so the reader it
# opened on the indexer's directory is released.
searcher.close()

### lupyne ###

expected = sorted(colors)
# with a doc count, the sort is given as a field name
hits = indexer.search(count=10, sort='color')
assert [hit['color'] for hit in hits] == expected
# without a doc count, the sort is given as a python callable key;
# the comparator lists the field values in the order they were indexed
comparator = indexer.comparator('color')
assert list(comparator) == list(colors)
hits = indexer.search(sort=comparator.__getitem__)
assert [hit['color'] for hit in hits] == expected

Table Of Contents

Previous topic

client

This Page