diff --git a/tests/xapian_tests/migrations/0001_initial.py b/tests/xapian_tests/migrations/0001_initial.py index 3977e95..f185c9e 100644 --- a/tests/xapian_tests/migrations/0001_initial.py +++ b/tests/xapian_tests/migrations/0001_initial.py @@ -18,6 +18,7 @@ class Migration(migrations.Migration): ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), ('type_name', models.CharField(max_length=50)), ('number', models.IntegerField()), + ('float_number', models.FloatField()), ('name', models.CharField(max_length=200)), ('date', models.DateField()), ('summary', models.TextField()), diff --git a/tests/xapian_tests/models.py b/tests/xapian_tests/models.py index 496c6b4..2fc1512 100644 --- a/tests/xapian_tests/models.py +++ b/tests/xapian_tests/models.py @@ -7,6 +7,7 @@ class Document(models.Model): type_name = models.CharField(max_length=50) number = models.IntegerField() + float_number = models.FloatField() name = models.CharField(max_length=200) date = models.DateField() diff --git a/tests/xapian_tests/search_indexes.py b/tests/xapian_tests/search_indexes.py index 7a80c81..9c9f2ce 100644 --- a/tests/xapian_tests/search_indexes.py +++ b/tests/xapian_tests/search_indexes.py @@ -10,6 +10,7 @@ class DocumentIndex(indexes.SearchIndex, indexes.Indexable): type_name = indexes.CharField(model_attr='type_name') number = indexes.IntegerField(model_attr='number') + float_number = indexes.FloatField(model_attr='float_number') name = indexes.CharField(model_attr='name') date = indexes.DateField(model_attr='date') diff --git a/tests/xapian_tests/tests/test_backend.py b/tests/xapian_tests/tests/test_backend.py index c0e4c1e..e99a9b3 100644 --- a/tests/xapian_tests/tests/test_backend.py +++ b/tests/xapian_tests/tests/test_backend.py @@ -559,9 +559,10 @@ def test_verify_type(self): def test_term_to_xapian_value(self): self.assertEqual(_term_to_xapian_value('abc', 'text'), 'abc') - self.assertEqual(_term_to_xapian_value(1, 'integer'), '000000000001') - self.assertEqual(_term_to_xapian_value(2653, 'integer'), '000000002653') - self.assertEqual(_term_to_xapian_value(25.5, 'float'), b'\xb2`') + self.assertEqual(_term_to_xapian_value(-1337, 'integer'), '316380') + self.assertEqual(_term_to_xapian_value(1, 'integer'), 'a0') + self.assertEqual(_term_to_xapian_value(2653, 'integer'), 'd12e80') + self.assertEqual(_term_to_xapian_value(25.5, 'float'), 'b260') self.assertEqual(_term_to_xapian_value([1, 2, 3], 'text'), '[1, 2, 3]') self.assertEqual(_term_to_xapian_value((1, 2, 3), 'text'), '(1, 2, 3)') self.assertEqual(_term_to_xapian_value({'a': 1, 'c': 3, 'b': 2}, 'text'), @@ -627,18 +628,18 @@ def test_parse_query_range(self): ]) self.assertExpectedQuery(self.backend.parse_query('number:0..10'), [ - '0 * VALUE_RANGE 11 000000000000 000000000010', - 'VALUE_RANGE 11 000000000000 000000000010', + '0 * VALUE_RANGE 11 80 ad', + 'VALUE_RANGE 11 80 ad', ]) self.assertExpectedQuery(self.backend.parse_query('number:..10'), [ - '0 * VALUE_RANGE 11 %012d 000000000010' % (-sys.maxsize - 1), - 'VALUE_RANGE 11 %012d 000000000010' % (-sys.maxsize - 1), + '0 * VALUE_LE 11 ad', + 'VALUE_LE 11 ad', ]) self.assertExpectedQuery(self.backend.parse_query('number:10..*'), [ - '0 * VALUE_RANGE 11 000000000010 %012d' % sys.maxsize, - 'VALUE_RANGE 11 000000000010 %012d' % sys.maxsize, + '0 * VALUE_RANGE 11 ad ffffffffffffffffff', + 'VALUE_RANGE 11 ad ffffffffffffffffff', ]) def test_order_by_django_id(self): diff --git a/tests/xapian_tests/tests/test_interface.py b/tests/xapian_tests/tests/test_interface.py index 8dcb1d8..fe52983 100644 --- a/tests/xapian_tests/tests/test_interface.py +++ b/tests/xapian_tests/tests/test_interface.py @@ -38,8 +38,9 @@ def setUp(self): for i in range(1, 13): doc = Document() doc.type_name = types_names[i % 3] - doc.number = i * 2 - doc.name = "%s %d" % (doc.type_name, doc.number) + doc.number = (i - 7) * 100 + doc.float_number = (i - 6.5) * 2 + doc.name = "%s %d" % (doc.type_name, i * 2) doc.date = dates[i % 3] doc.summary = summaries[i % 3] @@ -158,6 +159,15 @@ def test_value_range(self): self.assertEqual(set(pks(self.queryset.filter(number__lt=3))), set(pks(Document.objects.filter(number__lt=3)))) + self.assertEqual(set(pks(self.queryset.filter(float_number__lte=float('inf')))), + set(pks(Document.objects.filter(float_number__lte=float('inf'))))) + self.assertEqual(set(pks(self.queryset.filter(float_number__lt=3.5))), + set(pks(Document.objects.filter(float_number__lt=3.5)))) + self.assertEqual(set(pks(self.queryset.filter(float_number__gt=3.5))), + set(pks(Document.objects.filter(float_number__gt=3.5)))) + self.assertEqual(set(pks(self.queryset.filter(float_number__gte=float('-inf')))), + set(pks(Document.objects.filter(float_number__gte=float('-inf'))))) + self.assertEqual(set(pks(self.queryset.filter(django_id__gte=6))), set(pks(Document.objects.filter(id__gte=6)))) @@ -178,6 +188,8 @@ def test_order_by(self): # value order self.assertEqual(pks(self.queryset.order_by("number")), pks(Document.objects.order_by("number"))) + self.assertEqual(pks(self.queryset.order_by("float_number")), + pks(Document.objects.order_by("float_number"))) # text order self.assertEqual(pks(self.queryset.order_by("summary")), diff --git a/tests/xapian_tests/tests/test_query.py b/tests/xapian_tests/tests/test_query.py index 491a956..78468ec 100644 --- a/tests/xapian_tests/tests/test_query.py +++ b/tests/xapian_tests/tests/test_query.py @@ -278,34 +278,29 @@ def test_endswith(self): def test_gt(self): self.sq.add_filter(SQ(name__gt='m')) self.assertExpectedQuery(self.sq.build_query(), - '( AND_NOT VALUE_RANGE 3 a m)') + '( AND_NOT VALUE_LE 3 m)') def test_gte(self): self.sq.add_filter(SQ(name__gte='m')) self.assertExpectedQuery(self.sq.build_query(), - 'VALUE_RANGE 3 m zzzzzzzzzzzzzzzzzzzzzzzzzzzz' - 'zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz' - 'zzzzzzzzzzzzzzzzzzzzzzzzzzzz') + 'VALUE_GE 3 m') def test_lt(self): self.sq.add_filter(SQ(name__lt='m')) self.assertExpectedQuery(self.sq.build_query(), - '( AND_NOT VALUE_RANGE 3 m ' - 'zzzzzzzzzzzzzzzzzzzzzzzzzzzz' - 'zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz' - 'zzzzzzzzzzzzzzzzzzzzzzzzzzzz)') + '( AND_NOT VALUE_GE 3 m)') def test_lte(self): self.sq.add_filter(SQ(name__lte='m')) - self.assertExpectedQuery(self.sq.build_query(), 'VALUE_RANGE 3 a m') + self.assertExpectedQuery(self.sq.build_query(), 'VALUE_LE 3 m') def test_range(self): self.sq.add_filter(SQ(django_id__range=[2, 4])) - self.assertExpectedQuery(self.sq.build_query(), 'VALUE_RANGE 1 000000000002 000000000004') + self.assertExpectedQuery(self.sq.build_query(), 'VALUE_RANGE 1 a4 a8') self.sq.add_filter(~SQ(django_id__range=[0, 2])) self.assertExpectedQuery(self.sq.build_query(), - '(VALUE_RANGE 1 000000000002 000000000004 AND ' - '( AND_NOT VALUE_RANGE 1 000000000000 000000000002))') + '(VALUE_RANGE 1 a4 a8 AND ' + '( AND_NOT VALUE_RANGE 1 80 a4))') self.assertEqual([result.pk for result in self.sq.get_results()], [3]) def test_multiple_filter_types(self): @@ -315,13 +310,11 @@ def test_multiple_filter_types(self): self.sq.add_filter(SQ(title__gte='B')) self.sq.add_filter(SQ(django_id__in=[1, 2, 3])) self.assertExpectedQuery(self.sq.build_query(), - '((Zwhi OR why) AND' - ' VALUE_RANGE 5 00010101000000 20090210015900 AND' - ' ( AND_NOT VALUE_RANGE 3 a david)' - ' AND VALUE_RANGE 7 b zzzzzzzzzzzzzzzzzzzzzzzzzzz' - 'zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz' - 'zzzzzzzzzzzzzzzzzzzzzzzzz AND' - ' (QQ000000000001 OR QQ000000000002 OR QQ000000000003))') + '((Zwhi OR why) AND ' + 'VALUE_LE 5 20090210015900 AND ' + '( AND_NOT VALUE_LE 3 david) AND ' + 'VALUE_GE 7 b AND ' + '(QQa0 OR QQa4 OR QQa6))') def test_log_query(self): reset_search_queries() diff --git a/xapian_backend.py b/xapian_backend.py index 0b15ec7..c39aaad 100755 --- a/xapian_backend.py +++ b/xapian_backend.py @@ -67,7 +67,6 @@ # defines the format used to store types in Xapian # this format ensures datetimes are sorted correctly DATETIME_FORMAT = '%Y%m%d%H%M%S' -INTEGER_FORMAT = '%012d' # defines the distance given between # texts with positional information @@ -102,31 +101,36 @@ def __call__(self, begin, end): if field_dict['field_name'] == field_name: field_type = field_dict['type'] - if not begin: + if begin: + if field_type == 'float': + begin = _term_to_xapian_value(float(begin), field_type) + elif field_type == 'integer': + begin = _term_to_xapian_value(int(begin), field_type) + else: if field_type == 'text': begin = 'a' # TODO: A better way of getting a min text value? - elif field_type == 'integer': - begin = -sys.maxsize - 1 - elif field_type == 'float': - begin = float('-inf') + elif field_type in ('float', 'integer'): + # floats and ints are both serialised using xapian.sortable_serialise + # so we can use -Infinity as the lower bound for both of them. + begin = _term_to_xapian_value(float('-inf'), field_type) elif field_type == 'date' or field_type == 'datetime': begin = '00010101000000' - elif end == '*': + + if end == '*': if field_type == 'text': end = 'z' * 100 # TODO: A better way of getting a max text value? - elif field_type == 'integer': - end = sys.maxsize - elif field_type == 'float': - end = float('inf') + elif field_type in ('float', 'integer'): + # floats and ints are both serialised using xapian.sortable_serialise + # so we can use +Infinity as the upper bound for both of them. + end = _term_to_xapian_value(float('inf'), field_type) elif field_type == 'date' or field_type == 'datetime': end = '99990101000000' + else: + if field_type == 'float': + end = _term_to_xapian_value(float(end), field_type) + elif field_type == 'integer': + end = _term_to_xapian_value(int(end), field_type) - if field_type == 'float': - begin = _term_to_xapian_value(float(begin), field_type) - end = _term_to_xapian_value(float(end), field_type) - elif field_type == 'integer': - begin = _term_to_xapian_value(int(begin), field_type) - end = _term_to_xapian_value(int(end), field_type) return field_dict['column'], str(begin), str(end) @@ -942,13 +946,7 @@ def _process_facet_field_spies(self, spies): facet_dict[field_name] = [] for facet in list(spy.values()): - if field_type == 'float': - # the float term is a Xapian serialized object, which is - # in bytes. - term = facet.term - else: - term = facet.term.decode('utf-8') - + term = facet.term.decode('utf-8') facet_dict[field_name].append((_from_xapian_value(term, field_type), facet.termfreq)) return facet_dict @@ -1517,43 +1515,37 @@ def _filter_gte(self, term, field_name, field_type, is_not): Private method that returns a xapian.Query that searches for any term that is greater than `term` in a specified `field`. """ - vrp = XHValueRangeProcessor(self.backend) - pos, begin, end = vrp('%s:%s' % (field_name, _term_to_xapian_value(term, field_type)), '*') + pos = self.backend.column[field_name] + range_limit = _term_to_xapian_value(term, field_type) + result = xapian.Query(xapian.Query.OP_VALUE_GE, pos, range_limit) if is_not: - return xapian.Query(xapian.Query.OP_AND_NOT, - self._all_query(), - xapian.Query(xapian.Query.OP_VALUE_RANGE, pos, begin, end) - ) - return xapian.Query(xapian.Query.OP_VALUE_RANGE, pos, begin, end) + result = xapian.Query(xapian.Query.OP_AND_NOT, self._all_query(), result) + return result def _filter_lte(self, term, field_name, field_type, is_not): """ Private method that returns a xapian.Query that searches for any term that is less than `term` in a specified `field`. """ - vrp = XHValueRangeProcessor(self.backend) - pos, begin, end = vrp('%s:' % field_name, '%s' % _term_to_xapian_value(term, field_type)) + pos = self.backend.column[field_name] + range_limit = _term_to_xapian_value(term, field_type) + result = xapian.Query(xapian.Query.OP_VALUE_LE, pos, range_limit) if is_not: - return xapian.Query(xapian.Query.OP_AND_NOT, - self._all_query(), - xapian.Query(xapian.Query.OP_VALUE_RANGE, pos, begin, end) - ) - return xapian.Query(xapian.Query.OP_VALUE_RANGE, pos, begin, end) + result = xapian.Query(xapian.Query.OP_AND_NOT, self._all_query(), result) + return result def _filter_range(self, term, field_name, field_type, is_not): """ Private method that returns a xapian.Query that searches for any term that is between the values from the `term` list. """ - vrp = XHValueRangeProcessor(self.backend) - pos, begin, end = vrp('%s:%s' % (field_name, _term_to_xapian_value(term[0], field_type)), - '%s' % _term_to_xapian_value(term[1], field_type)) + pos = self.backend.column[field_name] + range_lower = _term_to_xapian_value(term[0], field_type) + range_upper = _term_to_xapian_value(term[1], field_type) + result = xapian.Query(xapian.Query.OP_VALUE_RANGE, pos, range_lower, range_upper) if is_not: - return xapian.Query(xapian.Query.OP_AND_NOT, - self._all_query(), - xapian.Query(xapian.Query.OP_VALUE_RANGE, pos, begin, end) - ) - return xapian.Query(xapian.Query.OP_VALUE_RANGE, pos, begin, end) + result = xapian.Query(xapian.Query.OP_AND_NOT, self._all_query(), result) + return result def _term_to_xapian_value(term, field_type): @@ -1578,10 +1570,13 @@ def strf(dt): else: value = 'f' - elif field_type == 'integer': - value = INTEGER_FORMAT % term - elif field_type == 'float': - value = xapian.sortable_serialise(term) + elif field_type in ('float', 'integer'): + # up until 2**53 integers can be safely represented as doubles which + # is what sortable_serialise() deals with. + # sortable_serialise() returns bytes but we can only pass str to Query + # and others. So we encode the byte string as hex characters instead + # which have the same sort order as the actual bytes. + value = xapian.sortable_serialise(term).hex().lower() elif field_type == 'date' or field_type == 'datetime': if field_type == 'date': # http://stackoverflow.com/a/1937636/931303 and comments @@ -1616,10 +1611,9 @@ def _from_xapian_value(value, field_type): return False else: InvalidIndexError('Field type "%d" does not accept value "%s"' % (field_type, value)) - elif field_type == 'integer': - return int(value) - elif field_type == 'float': - return xapian.sortable_unserialise(value) + elif field_type in ('float', 'integer'): + result = xapian.sortable_unserialise(bytes.fromhex(value)) + return int(result) if field_type == 'integer' else result elif field_type == 'date' or field_type == 'datetime': datetime_value = datetime.datetime.strptime(value, DATETIME_FORMAT) if field_type == 'datetime':