Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix filter and ordering issues with FloatField and IntegerField #186

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions tests/xapian_tests/migrations/0001_initial.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ class Migration(migrations.Migration):
('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
('type_name', models.CharField(max_length=50)),
('number', models.IntegerField()),
('float_number', models.FloatField()),
('name', models.CharField(max_length=200)),
('date', models.DateField()),
('summary', models.TextField()),
Expand Down
1 change: 1 addition & 0 deletions tests/xapian_tests/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
class Document(models.Model):
type_name = models.CharField(max_length=50)
number = models.IntegerField()
float_number = models.FloatField()
name = models.CharField(max_length=200)

date = models.DateField()
Expand Down
1 change: 1 addition & 0 deletions tests/xapian_tests/search_indexes.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ class DocumentIndex(indexes.SearchIndex, indexes.Indexable):
type_name = indexes.CharField(model_attr='type_name')

number = indexes.IntegerField(model_attr='number')
float_number = indexes.FloatField(model_attr='float_number')

name = indexes.CharField(model_attr='name')
date = indexes.DateField(model_attr='date')
Expand Down
19 changes: 10 additions & 9 deletions tests/xapian_tests/tests/test_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -559,9 +559,10 @@ def test_verify_type(self):

def test_term_to_xapian_value(self):
self.assertEqual(_term_to_xapian_value('abc', 'text'), 'abc')
self.assertEqual(_term_to_xapian_value(1, 'integer'), '000000000001')
self.assertEqual(_term_to_xapian_value(2653, 'integer'), '000000002653')
self.assertEqual(_term_to_xapian_value(25.5, 'float'), b'\xb2`')
self.assertEqual(_term_to_xapian_value(-1337, 'integer'), '316380')
self.assertEqual(_term_to_xapian_value(1, 'integer'), 'a0')
self.assertEqual(_term_to_xapian_value(2653, 'integer'), 'd12e80')
self.assertEqual(_term_to_xapian_value(25.5, 'float'), 'b260')
self.assertEqual(_term_to_xapian_value([1, 2, 3], 'text'), '[1, 2, 3]')
self.assertEqual(_term_to_xapian_value((1, 2, 3), 'text'), '(1, 2, 3)')
self.assertEqual(_term_to_xapian_value({'a': 1, 'c': 3, 'b': 2}, 'text'),
Expand Down Expand Up @@ -627,18 +628,18 @@ def test_parse_query_range(self):
])
self.assertExpectedQuery(self.backend.parse_query('number:0..10'),
[
'0 * VALUE_RANGE 11 000000000000 000000000010',
'VALUE_RANGE 11 000000000000 000000000010',
'0 * VALUE_RANGE 11 80 ad',
'VALUE_RANGE 11 80 ad',
])
self.assertExpectedQuery(self.backend.parse_query('number:..10'),
[
'0 * VALUE_RANGE 11 %012d 000000000010' % (-sys.maxsize - 1),
'VALUE_RANGE 11 %012d 000000000010' % (-sys.maxsize - 1),
'0 * VALUE_LE 11 ad',
'VALUE_LE 11 ad',
])
self.assertExpectedQuery(self.backend.parse_query('number:10..*'),
[
'0 * VALUE_RANGE 11 000000000010 %012d' % sys.maxsize,
'VALUE_RANGE 11 000000000010 %012d' % sys.maxsize,
'0 * VALUE_RANGE 11 ad ffffffffffffffffff',
'VALUE_RANGE 11 ad ffffffffffffffffff',
])

def test_order_by_django_id(self):
Expand Down
16 changes: 14 additions & 2 deletions tests/xapian_tests/tests/test_interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,9 @@ def setUp(self):
for i in range(1, 13):
doc = Document()
doc.type_name = types_names[i % 3]
doc.number = i * 2
doc.name = "%s %d" % (doc.type_name, doc.number)
doc.number = (i - 7) * 100
doc.float_number = (i - 6.5) * 2
doc.name = "%s %d" % (doc.type_name, i * 2)
doc.date = dates[i % 3]

doc.summary = summaries[i % 3]
Expand Down Expand Up @@ -158,6 +159,15 @@ def test_value_range(self):
self.assertEqual(set(pks(self.queryset.filter(number__lt=3))),
set(pks(Document.objects.filter(number__lt=3))))

self.assertEqual(set(pks(self.queryset.filter(float_number__lte=float('inf')))),
set(pks(Document.objects.filter(float_number__lte=float('inf')))))
self.assertEqual(set(pks(self.queryset.filter(float_number__lt=3.5))),
set(pks(Document.objects.filter(float_number__lt=3.5))))
self.assertEqual(set(pks(self.queryset.filter(float_number__gt=3.5))),
set(pks(Document.objects.filter(float_number__gt=3.5))))
self.assertEqual(set(pks(self.queryset.filter(float_number__gte=float('-inf')))),
set(pks(Document.objects.filter(float_number__gte=float('-inf')))))

self.assertEqual(set(pks(self.queryset.filter(django_id__gte=6))),
set(pks(Document.objects.filter(id__gte=6))))

Expand All @@ -178,6 +188,8 @@ def test_order_by(self):
# value order
self.assertEqual(pks(self.queryset.order_by("number")),
pks(Document.objects.order_by("number")))
self.assertEqual(pks(self.queryset.order_by("float_number")),
pks(Document.objects.order_by("float_number")))

# text order
self.assertEqual(pks(self.queryset.order_by("summary")),
Expand Down
31 changes: 12 additions & 19 deletions tests/xapian_tests/tests/test_query.py
Original file line number Diff line number Diff line change
Expand Up @@ -278,34 +278,29 @@ def test_endswith(self):
def test_gt(self):
self.sq.add_filter(SQ(name__gt='m'))
self.assertExpectedQuery(self.sq.build_query(),
'(<alldocuments> AND_NOT VALUE_RANGE 3 a m)')
'(<alldocuments> AND_NOT VALUE_LE 3 m)')

def test_gte(self):
self.sq.add_filter(SQ(name__gte='m'))
self.assertExpectedQuery(self.sq.build_query(),
'VALUE_RANGE 3 m zzzzzzzzzzzzzzzzzzzzzzzzzzzz'
'zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz'
'zzzzzzzzzzzzzzzzzzzzzzzzzzzz')
'VALUE_GE 3 m')

def test_lt(self):
self.sq.add_filter(SQ(name__lt='m'))
self.assertExpectedQuery(self.sq.build_query(),
'(<alldocuments> AND_NOT VALUE_RANGE 3 m '
'zzzzzzzzzzzzzzzzzzzzzzzzzzzz'
'zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz'
'zzzzzzzzzzzzzzzzzzzzzzzzzzzz)')
'(<alldocuments> AND_NOT VALUE_GE 3 m)')

def test_lte(self):
self.sq.add_filter(SQ(name__lte='m'))
self.assertExpectedQuery(self.sq.build_query(), 'VALUE_RANGE 3 a m')
self.assertExpectedQuery(self.sq.build_query(), 'VALUE_LE 3 m')

def test_range(self):
self.sq.add_filter(SQ(django_id__range=[2, 4]))
self.assertExpectedQuery(self.sq.build_query(), 'VALUE_RANGE 1 000000000002 000000000004')
self.assertExpectedQuery(self.sq.build_query(), 'VALUE_RANGE 1 a4 a8')
self.sq.add_filter(~SQ(django_id__range=[0, 2]))
self.assertExpectedQuery(self.sq.build_query(),
'(VALUE_RANGE 1 000000000002 000000000004 AND '
'(<alldocuments> AND_NOT VALUE_RANGE 1 000000000000 000000000002))')
'(VALUE_RANGE 1 a4 a8 AND '
'(<alldocuments> AND_NOT VALUE_RANGE 1 80 a4))')
self.assertEqual([result.pk for result in self.sq.get_results()], [3])

def test_multiple_filter_types(self):
Expand All @@ -315,13 +310,11 @@ def test_multiple_filter_types(self):
self.sq.add_filter(SQ(title__gte='B'))
self.sq.add_filter(SQ(django_id__in=[1, 2, 3]))
self.assertExpectedQuery(self.sq.build_query(),
'((Zwhi OR why) AND'
' VALUE_RANGE 5 00010101000000 20090210015900 AND'
' (<alldocuments> AND_NOT VALUE_RANGE 3 a david)'
' AND VALUE_RANGE 7 b zzzzzzzzzzzzzzzzzzzzzzzzzzz'
'zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz'
'zzzzzzzzzzzzzzzzzzzzzzzzz AND'
' (QQ000000000001 OR QQ000000000002 OR QQ000000000003))')
'((Zwhi OR why) AND '
'VALUE_LE 5 20090210015900 AND '
'(<alldocuments> AND_NOT VALUE_LE 3 david) AND '
'VALUE_GE 7 b AND '
'(QQa0 OR QQa4 OR QQa6))')

def test_log_query(self):
reset_search_queries()
Expand Down
102 changes: 48 additions & 54 deletions xapian_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,6 @@
# defines the format used to store types in Xapian
# this format ensures datetimes are sorted correctly
DATETIME_FORMAT = '%Y%m%d%H%M%S'
INTEGER_FORMAT = '%012d'

# defines the distance given between
# texts with positional information
Expand Down Expand Up @@ -102,31 +101,36 @@ def __call__(self, begin, end):
if field_dict['field_name'] == field_name:
field_type = field_dict['type']

if not begin:
if begin:
if field_type == 'float':
begin = _term_to_xapian_value(float(begin), field_type)
elif field_type == 'integer':
begin = _term_to_xapian_value(int(begin), field_type)
else:
if field_type == 'text':
begin = 'a' # TODO: A better way of getting a min text value?
elif field_type == 'integer':
begin = -sys.maxsize - 1
elif field_type == 'float':
begin = float('-inf')
elif field_type in ('float', 'integer'):
# floats and ints are both serialised using xapian.sortable_serialise
# so we can use -Infinity as the lower bound for both of them.
begin = _term_to_xapian_value(float('-inf'), field_type)
elif field_type == 'date' or field_type == 'datetime':
begin = '00010101000000'
elif end == '*':

if end == '*':
if field_type == 'text':
end = 'z' * 100 # TODO: A better way of getting a max text value?
elif field_type == 'integer':
end = sys.maxsize
elif field_type == 'float':
end = float('inf')
elif field_type in ('float', 'integer'):
# floats and ints are both serialised using xapian.sortable_serialise
# so we can use +Infinity as the upper bound for both of them.
end = _term_to_xapian_value(float('inf'), field_type)
elif field_type == 'date' or field_type == 'datetime':
end = '99990101000000'
else:
if field_type == 'float':
end = _term_to_xapian_value(float(end), field_type)
elif field_type == 'integer':
end = _term_to_xapian_value(int(end), field_type)
Comment on lines -105 to +132
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we can also simplify the logic for the other field types here. ValueRangeProcessors (which should maybe just be RangeProcessors now?) can return empty values to represent open ranges. In that case we wouldn't have to make up arbitrary lower and upper bounds for text and date values.


if field_type == 'float':
begin = _term_to_xapian_value(float(begin), field_type)
end = _term_to_xapian_value(float(end), field_type)
elif field_type == 'integer':
begin = _term_to_xapian_value(int(begin), field_type)
end = _term_to_xapian_value(int(end), field_type)
return field_dict['column'], str(begin), str(end)


Expand Down Expand Up @@ -942,13 +946,7 @@ def _process_facet_field_spies(self, spies):

facet_dict[field_name] = []
for facet in list(spy.values()):
if field_type == 'float':
# the float term is a Xapian serialized object, which is
# in bytes.
term = facet.term
else:
term = facet.term.decode('utf-8')

term = facet.term.decode('utf-8')
facet_dict[field_name].append((_from_xapian_value(term, field_type),
facet.termfreq))
return facet_dict
Expand Down Expand Up @@ -1517,43 +1515,37 @@ def _filter_gte(self, term, field_name, field_type, is_not):
Private method that returns a xapian.Query that searches for any term
that is greater than `term` in a specified `field`.
"""
vrp = XHValueRangeProcessor(self.backend)
pos, begin, end = vrp('%s:%s' % (field_name, _term_to_xapian_value(term, field_type)), '*')
pos = self.backend.column[field_name]
range_limit = _term_to_xapian_value(term, field_type)
result = xapian.Query(xapian.Query.OP_VALUE_GE, pos, range_limit)
if is_not:
return xapian.Query(xapian.Query.OP_AND_NOT,
self._all_query(),
xapian.Query(xapian.Query.OP_VALUE_RANGE, pos, begin, end)
)
return xapian.Query(xapian.Query.OP_VALUE_RANGE, pos, begin, end)
result = xapian.Query(xapian.Query.OP_AND_NOT, self._all_query(), result)
return result

def _filter_lte(self, term, field_name, field_type, is_not):
"""
Private method that returns a xapian.Query that searches for any term
that is less than `term` in a specified `field`.
"""
vrp = XHValueRangeProcessor(self.backend)
pos, begin, end = vrp('%s:' % field_name, '%s' % _term_to_xapian_value(term, field_type))
pos = self.backend.column[field_name]
range_limit = _term_to_xapian_value(term, field_type)
result = xapian.Query(xapian.Query.OP_VALUE_LE, pos, range_limit)
if is_not:
return xapian.Query(xapian.Query.OP_AND_NOT,
self._all_query(),
xapian.Query(xapian.Query.OP_VALUE_RANGE, pos, begin, end)
)
return xapian.Query(xapian.Query.OP_VALUE_RANGE, pos, begin, end)
result = xapian.Query(xapian.Query.OP_AND_NOT, self._all_query(), result)
return result

def _filter_range(self, term, field_name, field_type, is_not):
"""
Private method that returns a xapian.Query that searches for any term
that is between the values from the `term` list.
"""
vrp = XHValueRangeProcessor(self.backend)
pos, begin, end = vrp('%s:%s' % (field_name, _term_to_xapian_value(term[0], field_type)),
'%s' % _term_to_xapian_value(term[1], field_type))
pos = self.backend.column[field_name]
range_lower = _term_to_xapian_value(term[0], field_type)
range_upper = _term_to_xapian_value(term[1], field_type)
result = xapian.Query(xapian.Query.OP_VALUE_RANGE, pos, range_lower, range_upper)
if is_not:
return xapian.Query(xapian.Query.OP_AND_NOT,
self._all_query(),
xapian.Query(xapian.Query.OP_VALUE_RANGE, pos, begin, end)
)
return xapian.Query(xapian.Query.OP_VALUE_RANGE, pos, begin, end)
result = xapian.Query(xapian.Query.OP_AND_NOT, self._all_query(), result)
return result


def _term_to_xapian_value(term, field_type):
Expand All @@ -1578,10 +1570,13 @@ def strf(dt):
else:
value = 'f'

elif field_type == 'integer':
value = INTEGER_FORMAT % term
elif field_type == 'float':
value = xapian.sortable_serialise(term)
elif field_type in ('float', 'integer'):
# up until 2**53 integers can be safely represented as doubles which
# is what sortable_serialise() deals with.
# sortable_serialise() returns bytes but we can only pass str to Query
# and others. So we encode the byte string as hex characters instead
# which have the same sort order as the actual bytes.
value = xapian.sortable_serialise(term).hex().lower()
elif field_type == 'date' or field_type == 'datetime':
if field_type == 'date':
# http://stackoverflow.com/a/1937636/931303 and comments
Expand Down Expand Up @@ -1616,10 +1611,9 @@ def _from_xapian_value(value, field_type):
return False
else:
InvalidIndexError('Field type "%d" does not accept value "%s"' % (field_type, value))
elif field_type == 'integer':
return int(value)
elif field_type == 'float':
return xapian.sortable_unserialise(value)
elif field_type in ('float', 'integer'):
result = xapian.sortable_unserialise(bytes.fromhex(value))
return int(result) if field_type == 'integer' else result
elif field_type == 'date' or field_type == 'datetime':
datetime_value = datetime.datetime.strptime(value, DATETIME_FORMAT)
if field_type == 'datetime':
Expand Down