examples

Contrasting examples of the Lucene API versus Pythonic Lupyne idioms.

indexers

"""
Basic indexing and searching example adapted from http://lucene.apache.org/java/2_9_1/api/core/index.html
Compatible with Lucene versions 2.4 through 3.0.
"""

import lucene
lucene.initVM(lucene.CLASSPATH)
from lupyne import engine

### lucene ###

analyzer = lucene.StandardAnalyzer(lucene.Version.LUCENE_CURRENT) if hasattr(lucene, 'Version') else lucene.StandardAnalyzer()

# Store the index in memory:
directory = lucene.RAMDirectory()
# To store an index on disk, use this instead:
#directory = lucene.FSDirectory.open(lucene.File("/tmp/testindex"))
iwriter = lucene.IndexWriter(directory, analyzer, True, lucene.IndexWriter.MaxFieldLength(25000))
doc = lucene.Document()
text = "This is the text to be indexed."
doc.add(lucene.Field("fieldname", text, lucene.Field.Store.YES, lucene.Field.Index.ANALYZED))
iwriter.addDocument(doc)
iwriter.close()

# Now search the index:
isearcher = lucene.IndexSearcher(directory)
# Parse a simple query that searches for "text":
parser = lucene.QueryParser(lucene.Version.LUCENE_CURRENT, "fieldname", analyzer) if hasattr(lucene, 'Version') else lucene.QueryParser("fieldname", analyzer)
query = parser.parse("text")
hits = isearcher.search(query, None, 1000).scoreDocs
assert len(hits) == 1
# Iterate through the results:
for hit in hits:
    hitDoc = isearcher.doc(hit.doc)
    assert hitDoc['fieldname'] == text
isearcher.close()
directory.close()

### lupyne ###

# Store the index in memory:
indexer = engine.Indexer()              # Indexer combines Writer and Searcher; RAMDirectory and StandardAnalyzer are defaults
indexer.set('fieldname', store=True)    # settings for all documents of indexer; tokenized is the default
indexer.add(fieldname=text)             # add document
indexer.commit()                        # commit changes and refresh searcher

# Now search the index:
hits = indexer.search('text', field='fieldname')    # parsing handled if necessary
assert len(hits) == 1
for hit in hits:                                    # hits support mapping interface
    assert hit['fieldname'] == text
# closing is handled automatically
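
# The Indexer is both writer and searcher, so deletes and updates go through
# the same object. A minimal sketch, assuming delete mirrors search's query
# parsing options (verify against your lupyne version):
indexer.delete('text', field='fieldname')           # stage deletion of matching documents
indexer.commit()                                    # deletion becomes visible after commit
assert len(indexer.search('text', field='fieldname')) == 0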

queries

"""
Convenient Query creation.

Operator overloading is used for combining boolean clauses.
"""

import lucene
lucene.initVM(lucene.CLASSPATH)
from lupyne.engine import Query

### lucene ###

q1 = lucene.TermQuery(lucene.Term('text', 'lucene'))
q2 = lucene.PhraseQuery()
q2.add(lucene.Term('text', 'search'))
q2.add(lucene.Term('text', 'engine'))
q3 = lucene.BooleanQuery()
q3.add(q1, lucene.BooleanClause.Occur.MUST)
q3.add(q2, lucene.BooleanClause.Occur.MUST)
assert str(q3) == '+text:lucene +text:"search engine"'

q1 = lucene.SpanTermQuery(lucene.Term('text', 'hello'))
q2 = lucene.SpanFirstQuery(q1, 10)
q3 = lucene.SpanNotQuery(q1, q2)
assert str(q3) == 'spanNot(text:hello, spanFirst(text:hello, 10))'

### lupyne ###

q = Query.term('text', 'lucene') & Query.phrase('text', 'search', 'engine')
assert isinstance(q, lucene.BooleanQuery)
assert str(q) == '+text:lucene +text:"search engine"'

q = Query.span('text', 'hello')
q -= q[:10]
assert str(q) == 'spanNot(text:hello, spanFirst(text:hello, 10))'
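
# The remaining boolean operators follow the same pattern; a brief sketch,
# assuming | generates optional (SHOULD) clauses and - a prohibited clause:
q = Query.term('text', 'lucene') | Query.term('text', 'pylucene')
assert str(q) == 'text:lucene text:pylucene'
q = Query.term('text', 'lucene') - Query.term('text', 'java')
assert str(q) == 'text:lucene -text:java'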

searching

"""
Advanced searching with custom fields.

Prefix and Range queries are a potential pitfall in Lucene.
As the queries expand to more terms, the performance drops off precipitously.

A common example is where datetimes are indexed, but large spans of dates are being searched.
The usual workaround is to only index the granularity needed, e.g., just the dates.
But this may not be sufficient, or the full datetimes may be necessary for other searches.

In any case, the principle can be generalized to indexing every prefix of a term, as sketched below.
The cost in indexing time and space is well worth the optimal search times.

Lupyne's PrefixField automates the indexing of such prefix trees into different fields.
The default naming convention makes each field look like a Python slice of the original field.
The fields also provide prefix and range query generators that optimally utilize the underlying fields.

NestedFields extend PrefixFields to support a common separator.
DateTimeFields extend PrefixFields with datetime specific query generators.
PointFields extend PrefixFields to support geospatial queries.
"""

import lucene
lucene.initVM(lucene.CLASSPATH)
from lupyne import engine

docs = [
    {'city': 'San Francisco', 'state': 'CA', 'incorporated': '1850-04-15', 'population': '0,808,976', 'longitude': -122.4192, 'latitude': 37.7752},
    {'city': 'Los Angeles', 'state': 'CA', 'incorporated': '1850-04-04', 'population': '3,849,378', 'longitude': -118.2434, 'latitude': 34.0521},
    {'city': 'Portland', 'state': 'OR', 'incorporated': '1851-02-08', 'population': '0,575,930', 'longitude': -122.6703, 'latitude': 45.5238},
]

indexer = engine.Indexer()
indexer.set('city', store=True, index=False)
indexer.set('state', store=True, index=False)
# set method supports custom field types inheriting their default settings
indexer.set('incorporated', engine.DateTimeField)
indexer.set('population', engine.PrefixField)
indexer.set('point', engine.PointField, precision=10)
# assigned fields can have a different key from their underlying field name
indexer.fields['location'] = engine.NestedField('state:city')

for doc in docs:
    location = doc['state'] + ':' + doc['city']
    point = doc.pop('longitude'), doc.pop('latitude')
    indexer.add(doc, location=location, point=[point])
indexer.commit()

query = indexer.fields['incorporated'].range('1800', '1851-02-07')
# automatically handles date arithmetic to compute an optimal boolean (OR) query
assert str(query) == 'incorporated:Y:[1800 TO 1851} incorporated:Ym:[1851 TO 1851-02} incorporated:Ymd:[1851-02 TO 1851-02-07}'
assert [hit['city'] for hit in indexer.search(query)] == ['San Francisco', 'Los Angeles']

query = indexer.fields['population'].range('0', '1,000,000')
# works like any range query
assert str(query) == 'population[:9]:[0 TO 1,000,000}'
assert [hit['city'] for hit in indexer.search(query)] == ['San Francisco', 'Portland']
query = indexer.fields['population'].range('0', '1')
# optimized to search the best field
assert str(query) == 'population[:1]:[0 TO 1}'
assert [hit['city'] for hit in indexer.search(query)] == ['San Francisco', 'Portland']

query = indexer.fields['location'].prefix('CA:San')
# works like any prefix query
assert str(query) == 'state:city:CA:San*'
assert [hit['city'] for hit in indexer.search(query)] == ['San Francisco']
query = indexer.fields['location'].prefix('CA')
# optimized to search the best field
assert str(query) == 'state:CA*'
assert [hit['city'] for hit in indexer.search(query)] == ['San Francisco', 'Los Angeles']

cities = ['San Francisco', 'Los Angeles', 'Portland']
for index, distance in enumerate([1e3, 1e5, 2e5, 1e6]):
    query = indexer.fields['point'].within(-122.4, 37.7, distance=distance)
    assert isinstance(query, lucene.BooleanQuery) and len(query) <= 4
    assert set(hit['city'] for hit in indexer.search(query)) == set(cities[:index])
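
# Range queries work on the nested location field as well; a hedged sketch,
# assuming NestedField inherits PrefixField's range generator:
query = indexer.fields['location'].range('CA:L', 'CA:S')
assert [hit['city'] for hit in indexer.search(query)] == ['Los Angeles']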

sorting

"""
PyLucene has several pitfalls when collecting or sorting a large query result.
Generally they involve the overhead of crossing the Java VM boundary in an inner loop.

Lucene's performance itself drops off noticeably as the requested doc count increases.
This is heavily compounded by having to iterate through a large set of ScoreDocs in PyLucene.

Lucene also only supports sorting a query result when a doc count is supplied.
And supplying an excessively large count is not a good workaround because of the aforementioned problem.

Finally, the custom sorting interface, although well supported in PyLucene, is basically useless.
The sort key of every potential doc must realistically be cached anyway,
but the performance overhead of O(n log n) comparison calls into Java is still horrid.

To mitigate all these problems, Lupyne first provides a unified search interface.
The same Hits type is returned regardless of whether a doc count is supplied.
As with Lucene, the result is fully evaluated, but each individual Hit object is only loaded on demand.
Internally an optimized custom hit Collector is used when all docs are requested.

The search method does allow Lucene Sort parameters to be passed through, since that's still optimal.
So the only gotcha is that with no doc count, the sort parameter must instead be a Python callable key.
The IndexReader.comparator method is convenient for creating a sort key table from indexed fields.
The upshot is that custom sorting and sorting large results are both easier and faster.

Custom sorting isn't necessary in the example below, of course; it's just there for demonstration.
Compatible with Lucene versions 2.4 through 3.0.
"""

import lucene
lucene.initVM(lucene.CLASSPATH)
from lupyne import engine

colors = 'red', 'green', 'blue', 'cyan', 'magenta', 'yellow'
indexer = engine.Indexer()
indexer.set('color', store=True, index=True)
for color in colors:
    indexer.add(color=color)
indexer.commit()

### lucene ###

searcher = lucene.IndexSearcher(indexer.directory)
topdocs = searcher.search(lucene.MatchAllDocsQuery(), None, 10, lucene.Sort(lucene.SortField('color', lucene.SortField.STRING)))
assert [searcher.doc(scoredoc.doc)['color'] for scoredoc in topdocs.scoreDocs] == sorted(colors)

if hasattr(lucene, 'PythonFieldComparatorSource'):
    class ComparatorSource(lucene.PythonFieldComparatorSource):
        # the nested class doubles as the newComparator factory method
        class newComparator(lucene.PythonFieldComparator):
            def __init__(self, name, numHits, sortPos, reversed):
                lucene.PythonFieldComparator.__init__(self)
                self.name = name
                self.comparator = []
            def setNextReader(self, reader, base):
                self.base = base
                self.comparator += lucene.FieldCache.DEFAULT.getStrings(reader, self.name)
            def compare(self, slot1, slot2):
                return cmp(self.comparator[slot1], self.comparator[slot2])
            def setBottom(self, slot):
                self._bottom = self.comparator[slot]
            def compareBottom(self, doc):
                return cmp(self._bottom, self.comparator[doc + self.base])
            def copy(self, slot, doc):
                pass
            def value(self, slot):
                return lucene.String()
else:
    class ComparatorSource(lucene.PythonSortComparatorSource):
        class newComparator(lucene.PythonScoreDocComparator):
            def __init__(self, reader, name):
                lucene.PythonScoreDocComparator.__init__(self)
                self.comparator = list(lucene.FieldCache.DEFAULT.getStrings(reader, name))
            def compare(self, i, j):
                return cmp(self.comparator[i.doc], self.comparator[j.doc])
            def sortValue(self, i):
                return lucene.String()
            def sortType(self):
                return lucene.SortField.STRING

sorter = lucene.Sort(lucene.SortField('color', ComparatorSource()))
# still must supply an excessive doc count to use the sorter
topdocs = searcher.search(lucene.MatchAllDocsQuery(), None, 10, sorter)
assert [searcher.doc(scoredoc.doc)['color'] for scoredoc in topdocs.scoreDocs] == sorted(colors)

### lupyne ###

hits = indexer.search(count=10, sort='color')
assert [hit['color'] for hit in hits] == sorted(colors)
comparator = indexer.comparator('color')
assert list(comparator) == list(colors)
hits = indexer.search(sort=comparator.__getitem__)
assert [hit['color'] for hit in hits] == sorted(colors)
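
# Descending sorts are also supported; a sketch passing a Lucene Sort through,
# as the docstring above notes is allowed (assumes a reversed SortField works here):
sorter = lucene.Sort(lucene.SortField('color', lucene.SortField.STRING, True))
hits = indexer.search(count=10, sort=sorter)
assert [hit['color'] for hit in hits] == sorted(colors, reverse=True)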
