12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586158715881589159015911592159315941595159615971598159916001601160216031604160516061607160816091610161116121613 |
- # Copyright 2007 Matt Chaput. All rights reserved.
- #
- # Redistribution and use in source and binary forms, with or without
- # modification, are permitted provided that the following conditions are met:
- #
- # 1. Redistributions of source code must retain the above copyright notice,
- # this list of conditions and the following disclaimer.
- #
- # 2. Redistributions in binary form must reproduce the above copyright
- # notice, this list of conditions and the following disclaimer in the
- # documentation and/or other materials provided with the distribution.
- #
- # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
- # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
- # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
- # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
- # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
- # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
- # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
- # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- #
- # The views and conclusions contained in the software and documentation are
- # those of the authors and should not be interpreted as representing official
- # policies, either expressed or implied, of Matt Chaput.
- """
- Contains functions and classes related to fields.
- """
- import datetime, fnmatch, re, struct, sys
- from array import array
- from decimal import Decimal
- from whoosh import analysis, columns, formats
- from whoosh.compat import with_metaclass
- from whoosh.compat import itervalues, xrange
- from whoosh.compat import bytes_type, string_type, text_type
- from whoosh.system import emptybytes
- from whoosh.system import pack_byte, unpack_byte
- from whoosh.util.numeric import to_sortable, from_sortable
- from whoosh.util.numeric import typecode_max, NaN
- from whoosh.util.text import utf8encode, utf8decode
- from whoosh.util.times import datetime_to_long, long_to_datetime
- # Exceptions
class FieldConfigurationError(Exception):
    """
    Exception type for problems with a field's configuration.

    Adds nothing to the built-in ``Exception``; it exists so callers can
    catch this category of error by name.
    """
class UnknownFieldError(Exception):
    """
    Exception type for references to a field that is not known.

    Adds nothing to the built-in ``Exception``; it exists so callers can
    catch this category of error by name.
    """
- # Field Types
class FieldType(object):
    """
    Represents a field configuration.

    The FieldType object supports the following attributes:

    * format (formats.Format): the storage format for posting blocks.

    * analyzer (analysis.Analyzer): the analyzer to use to turn text into
      terms.

    * scorable (boolean): whether searches against this field may be scored.
      This controls whether the index stores per-document field lengths for
      this field.

    * stored (boolean): whether the content of this field is stored for each
      document. For example, in addition to indexing the title of a document,
      you usually want to store the title so it can be presented as part of
      the search results.

    * unique (boolean): whether this field's value is unique to each document.
      For example, 'path' or 'ID'. IndexWriter.update_document() will use
      fields marked as 'unique' to find the previous version of a document
      being updated.

    * multitoken_query is a string indicating what kind of query to use when
      a "word" in a user query parses into multiple tokens. The string is
      interpreted by the query parser. The strings understood by the default
      query parser are "first" (use first token only), "and" (join the tokens
      with an AND query), "or" (join the tokens with OR), "phrase" (join
      the tokens with a phrase query), and "default" (use the query parser's
      default join type).

    * vector (formats.Format or boolean): the format to use to store term
      vectors. If not a ``Format`` object, any true value means to use the
      index format as the term vector format. Any false value means don't
      store term vectors for this field.

    The constructor for the base field type simply lets you supply your own
    attribute values. Subclasses may configure some or all of this for you.
    """

    # Class-level defaults, so subclasses whose constructors do not call
    # FieldType.__init__ still expose every attribute.
    analyzer = format = scorable = stored = unique = vector = None
    indexed = True
    multitoken_query = "default"
    sortable_typecode = None
    column_type = None

    def __init__(self, format, analyzer, scorable=False,
                 stored=False, unique=False, multitoken_query="default",
                 sortable=False, vector=None):
        """
        :param format: the ``formats.Format`` used for postings.
        :param analyzer: the analyzer used to turn text into terms.
        :param scorable: whether searches against this field may be scored.
        :param stored: whether the field value is stored with the document.
        :param unique: whether the value is unique per document.
        :param multitoken_query: see the class docstring.
        :param sortable: a ``columns.Column`` instance to use as the column
            type, or any other true value to use ``default_column()``; a
            false value means no column.
        :param vector: a ``formats.Format`` for term vectors, or any other
            true value to reuse ``format``; a false value disables vectors.
        """
        self.format = format
        self.analyzer = analyzer
        self.scorable = scorable
        self.stored = stored
        self.unique = unique
        self.multitoken_query = multitoken_query
        self.set_sortable(sortable)

        if isinstance(vector, formats.Format):
            self.vector = vector
        elif vector:
            self.vector = self.format
        else:
            self.vector = None

    def __repr__(self):
        return ("%s(format=%r, scorable=%s, stored=%s, unique=%s)"
                % (self.__class__.__name__, self.format, self.scorable,
                   self.stored, self.unique))

    def __eq__(self, other):
        """
        Two field types are equal when ``other`` is a ``FieldType`` and the
        format, scorable, stored, unique and column_type attributes match.
        """
        # Check the type first: the attribute reads below would raise
        # AttributeError for arbitrary objects (e.g. ``field == None``).
        if not isinstance(other, FieldType):
            return False
        return all((self.format == other.format,
                    self.scorable == other.scorable,
                    self.stored == other.stored,
                    self.unique == other.unique,
                    self.column_type == other.column_type))

    def __ne__(self, other):
        return not(self.__eq__(other))

    # Text

    def index(self, value, **kwargs):
        """Returns an iterator of (btext, frequency, weight, encoded_value)
        tuples for each unique word in the input value.

        The default implementation uses the ``analyzer`` attribute to tokenize
        the value into strings, then encodes them into bytes using UTF-8.

        :param value: a unicode string, list or tuple to index.
        :param kwargs: passed through to the format's ``word_values()``;
            ``mode`` defaults to ``"index"`` when not supplied.
        :raises Exception: if the field has no format.
        :raises ValueError: if ``value`` is not unicode, a list or a tuple.
        """

        if not self.format:
            raise Exception("%s field %r cannot index without a format"
                            % (self.__class__.__name__, self))
        if not isinstance(value, (text_type, list, tuple)):
            raise ValueError("%r is not unicode or sequence" % value)
        assert isinstance(self.format, formats.Format)

        if "mode" not in kwargs:
            kwargs["mode"] = "index"

        word_values = self.format.word_values
        ana = self.analyzer
        for tstring, freq, wt, vbytes in word_values(value, ana, **kwargs):
            yield (utf8encode(tstring)[0], freq, wt, vbytes)

    def tokenize(self, value, **kwargs):
        """
        Analyzes the given string and returns an iterator of Token objects
        (note: for performance reasons, actually the same token yielded over
        and over with different attributes).

        :raises Exception: if the field has no analyzer.
        """

        if not self.analyzer:
            raise Exception("%s field has no analyzer" % self.__class__)
        return self.analyzer(value, **kwargs)

    def process_text(self, qstring, mode='', **kwargs):
        """
        Analyzes the given string and returns an iterator of token texts.

        >>> field = fields.TEXT()
        >>> list(field.process_text("The ides of March"))
        ["ides", "march"]

        :raises Exception: if the field has no format.
        """

        if not self.format:
            raise Exception("%s field has no format" % self)
        return (t.text for t in self.tokenize(qstring, mode=mode, **kwargs))

    # Conversion

    def to_bytes(self, value):
        """
        Returns a bytes representation of the given value, appropriate to be
        written to disk. The default implementation assumes a unicode value and
        encodes it using UTF-8.

        If ``value`` is a list or tuple only its first item is converted;
        values that are already bytes are returned unchanged.
        """

        if isinstance(value, (list, tuple)):
            value = value[0]
        if not isinstance(value, bytes_type):
            value = utf8encode(value)[0]
        return value

    def to_column_value(self, value):
        """
        Returns an object suitable to be inserted into the document values
        column for this field. The default implementation simply calls
        ``self.to_bytes(value)``.
        """

        return self.to_bytes(value)

    def from_bytes(self, bs):
        """
        Decodes UTF-8 bytes back into a unicode string (the inverse of the
        default ``to_bytes()``).
        """

        return utf8decode(bs)[0]

    def from_column_value(self, value):
        """
        Converts a column value back into a field value. The default
        implementation simply calls ``self.from_bytes(value)``.
        """

        return self.from_bytes(value)

    # Columns/sorting

    def set_sortable(self, sortable):
        """
        Sets ``column_type`` from the ``sortable`` argument: a
        ``columns.Column`` instance is used as-is, any other true value
        selects ``self.default_column()``, and a false value clears it.
        """

        if sortable:
            if isinstance(sortable, columns.Column):
                self.column_type = sortable
            else:
                self.column_type = self.default_column()
        else:
            self.column_type = None

    def sortable_terms(self, ixreader, fieldname):
        """
        Returns an iterator of the "sortable" tokens in the given reader and
        field. These values can be used for sorting. The default implementation
        simply returns all tokens in the field.

        This can be overridden by field types such as NUMERIC where some values
        in a field are not useful for sorting.
        """

        return ixreader.lexicon(fieldname)

    def default_column(self):
        """
        Returns the column object used when the field is made sortable
        without an explicit column; by default a ``columns.VarBytesColumn``.
        """

        return columns.VarBytesColumn()

    # Parsing

    def self_parsing(self):
        """
        Subclasses should override this method to return True if they want
        the query parser to call the field's ``parse_query()`` method instead
        of running the analyzer on text in this field. This is useful where
        the field needs full control over how queries are interpreted, such
        as in the numeric field type.
        """

        return False

    def parse_query(self, fieldname, qstring, boost=1.0):
        """
        When ``self_parsing()`` returns True, the query parser will call
        this method to parse basic query text.

        :raises NotImplementedError: always, in the base class.
        """

        raise NotImplementedError(self.__class__.__name__)

    def parse_range(self, fieldname, start, end, startexcl, endexcl,
                    boost=1.0):
        """
        When ``self_parsing()`` returns True, the query parser will call
        this method to parse range query text. If this method returns None
        instead of a query object, the parser will fall back to parsing the
        start and end terms using process_text().
        """

        return None

    # Spelling

    def separate_spelling(self):
        """
        Returns True if the field stores unstemmed words in a separate field for
        spelling suggestions.
        """

        return False

    def spelling_fieldname(self, fieldname):
        """
        Returns the name of a field to use for spelling suggestions instead of
        this field.

        :param fieldname: the name of this field.
        """

        return fieldname

    def spellable_words(self, value):
        """Returns an iterator of each unique word (in sorted order) in the
        input value, suitable for inclusion in the field's word graph.

        The default behavior is to call the field analyzer with the keyword
        argument ``no_morph=True``, which should make the analyzer skip any
        morphological transformation filters (e.g. stemming) to preserve the
        original form of the words. Exotic field types may need to override
        this behavior.

        A list or tuple is treated as already-tokenized words and is not
        passed through the analyzer.
        """

        if isinstance(value, (list, tuple)):
            words = value
        else:
            words = [token.text for token
                     in self.analyzer(value, no_morph=True)]

        return iter(sorted(set(words)))

    # Utility

    def subfields(self):
        """
        Returns an iterator of ``(name_prefix, fieldobject)`` pairs for the
        fields that need to be indexed when content is put in this field. The
        default implementation simply yields ``("", self)``.
        """

        yield "", self

    def supports(self, name):
        """
        Returns True if the underlying format supports the given posting
        value type.

        >>> field = TEXT()
        >>> field.supports("positions")
        True
        >>> field.supports("chars")
        False
        """

        return self.format.supports(name)

    def clean(self):
        """
        Clears any cached information in the field and any child objects.
        """

        if self.format and hasattr(self.format, "clean"):
            self.format.clean()

    # Events

    def on_add(self, schema, fieldname):
        """Hook taking the schema and field name; does nothing by default."""
        pass

    def on_remove(self, schema, fieldname):
        """Hook taking the schema and field name; does nothing by default."""
        pass
- # Wrapper base class
class FieldWrapper(FieldType):
    """
    Base class for field types that wrap another field object.

    The constructor copies the wrapped field's configuration attributes onto
    the wrapper, and most methods simply delegate to the wrapped field.
    """

    def __init__(self, subfield, prefix):
        """
        :param subfield: the field object to wrap. If a class is given
            instead of an instance, it is instantiated with no arguments.
        :param prefix: stored as ``self.name_prefix``.
        """
        if isinstance(subfield, type):
            subfield = subfield()
        self.subfield = subfield
        self.name_prefix = prefix

        # By default we'll copy all the subfield's attributes -- override these
        # in subclass constructor for things you want to change
        self.analyzer = subfield.analyzer
        self.format = subfield.format
        self.column_type = subfield.column_type
        self.scorable = subfield.scorable
        self.stored = subfield.stored
        self.unique = subfield.unique
        self.indexed = subfield.indexed
        self.vector = subfield.vector

    def __eq__(self, other):
        return self.subfield.__eq__(other)

    def __ne__(self, other):
        return self.subfield.__ne__(other)

    # Text

    # index(), tokenize() and process_text() are not delegated; the
    # implementations inherited from FieldType run against the attributes
    # copied in __init__. The delegating versions are kept for reference:
    #
    # def index(self, value, boost=1.0, **kwargs):
    #     return self.subfield.index(value, boost, **kwargs)
    #
    # def tokenize(self, value, **kwargs):
    #     return self.subfield.tokenize(value, **kwargs)
    #
    # def process_text(self, qstring, mode='', **kwargs):
    #     return self.subfield.process_text(qstring, mode, **kwargs)

    # Conversion

    def to_bytes(self, value):
        return self.subfield.to_bytes(value)

    def to_column_value(self, value):
        return self.subfield.to_column_value(value)

    def from_bytes(self, bs):
        return self.subfield.from_bytes(bs)

    def from_column_value(self, value):
        return self.subfield.from_column_value(value)

    # Sorting/columns

    def set_sortable(self, sortable):
        # Only the wrapped field is updated here; the wrapper's own
        # ``column_type`` keeps the value copied in __init__.
        self.subfield.set_sortable(sortable)

    def sortable_terms(self, ixreader, fieldname):
        return self.subfield.sortable_terms(ixreader, fieldname)

    def default_column(self):
        return self.subfield.default_column()

    # Parsing

    def self_parsing(self):
        return self.subfield.self_parsing()

    def parse_query(self, fieldname, qstring, boost=1.0):
        return self.subfield.parse_query(fieldname, qstring, boost)

    def parse_range(self, fieldname, start, end, startexcl, endexcl, boost=1.0):
        """
        Delegates range parsing to the wrapped field and returns its result
        (a query object, or None to make the parser fall back to
        ``process_text()``).
        """
        # The result must be returned: without it this method always yields
        # None and the wrapped field's range query is silently discarded.
        return self.subfield.parse_range(fieldname, start, end, startexcl,
                                         endexcl, boost)

    # Utility

    def subfields(self):
        # The default FieldWrapper.subfields() implementation DOES NOT split
        # out the subfield here -- you need to override if that's what you want
        yield "", self

    def supports(self, name):
        return self.subfield.supports(name)

    def clean(self):
        self.subfield.clean()

    # Events

    def on_add(self, schema, fieldname):
        self.subfield.on_add(schema, fieldname)

    def on_remove(self, schema, fieldname):
        self.subfield.on_remove(schema, fieldname)
- # Pre-configured field types
class ID(FieldType):
    """
    Pre-configured field type whose whole value is indexed as a single
    token. Use it for data that should not be tokenized, such as a file
    path.
    """

    def __init__(self, stored=False, unique=False, field_boost=1.0,
                 sortable=False, analyzer=None):
        """
        :param stored: Whether the value of this field is stored with the
            document.
        :param unique: Whether the value of this field is unique
            per-document.
        :param field_boost: passed to the ``formats.Existence`` format.
        :param sortable: passed to ``set_sortable()``.
        :param analyzer: the analyzer to use; when false, an
            ``analysis.IDAnalyzer`` is created.
        """

        if not analyzer:
            analyzer = analysis.IDAnalyzer()
        self.analyzer = analyzer

        # The Existence format records nothing beyond the doc ID
        self.format = formats.Existence(field_boost=field_boost)

        self.stored, self.unique = stored, unique
        self.set_sortable(sortable)
class IDLIST(FieldType):
    """
    Pre-configured field type for values holding several IDs separated by
    whitespace and/or punctuation (or by anything else, via the
    ``expression`` parameter).
    """

    def __init__(self, stored=False, unique=False, expression=None,
                 field_boost=1.0):
        """
        :param stored: Whether the value of this field is stored with the
            document.
        :param unique: Whether the value of this field is unique per-document.
        :param expression: The regular expression object to use to extract
            tokens. The default expression breaks tokens on CRs, LFs, tabs,
            spaces, commas, and semicolons.
        :param field_boost: passed to the ``formats.Existence`` format.
        """

        if not expression:
            expression = re.compile(r"[^\r\n\t ,;]+")
        self.analyzer = analysis.RegexAnalyzer(expression=expression)

        # The Existence format records nothing beyond the doc ID
        self.format = formats.Existence(field_boost=field_boost)

        self.stored, self.unique = stored, unique
class NUMERIC(FieldType):
    """
    Special field type that lets you index integer or floating point
    numbers in relatively short fixed-width terms. The field converts numbers
    to sortable bytes for you before indexing.

    You specify the numeric type of the field (``int`` or ``float``) when you
    create the ``NUMERIC`` object. The default is ``int``. For ``int``, you can
    specify a size in bits (``32`` or ``64``). For both ``int`` and ``float``
    you can specify a ``signed`` keyword argument (default is ``True``).

    >>> schema = Schema(path=STORED, position=NUMERIC(int, 64, signed=False))
    >>> ix = storage.create_index(schema)
    >>> with ix.writer() as w:
    ...     w.add_document(path="/a", position=5820402204)
    ...

    You can also use the NUMERIC field to store Decimal instances by specifying
    a type of ``int`` or ``long`` and the ``decimal_places`` keyword argument.
    This simply multiplies each number by ``(10 ** decimal_places)`` before
    storing it as an integer. Of course this may throw away decimal precision
    (by truncating, not rounding) and imposes the same maximum value limits as
    ``int``/``long``, but these may be acceptable for certain applications.

    >>> from decimal import Decimal
    >>> schema = Schema(path=STORED, position=NUMERIC(int, decimal_places=4))
    >>> ix = storage.create_index(schema)
    >>> with ix.writer() as w:
    ...     w.add_document(path="/a", position=Decimal("123.45"))
    ...

    """

    def __init__(self, numtype=int, bits=32, stored=False, unique=False,
                 field_boost=1.0, decimal_places=0, shift_step=4, signed=True,
                 sortable=False, default=None):
        """
        :param numtype: the type of numbers that can be stored in this field,
            either ``int``, ``float``. If you use ``Decimal``,
            use the ``decimal_places`` argument to control how many decimal
            places the field will store.
        :param bits: When ``numtype`` is ``int``, the number of bits to use to
            store the number: 8, 16, 32, or 64.
        :param stored: Whether the value of this field is stored with the
            document.
        :param unique: Whether the value of this field is unique per-document.
        :param field_boost: passed to the ``formats.Existence`` format.
        :param decimal_places: specifies the number of decimal places to save
            when storing Decimal instances. If you set this, you will always
            get Decimal instances back from the field.
        :param shift_step: The number of bits of precision to shift away at
            each tiered indexing level. Values should generally be 1-8. Lower
            values yield faster searches but take up more space. A value
            of `0` means no tiered indexing.
        :param signed: Whether the numbers stored in this field may be
            negative.
        :param sortable: passed to ``set_sortable()``.
        :param default: the default value given to the numeric column. When
            None, the maximum value for the sortable typecode is used for
            ``int`` fields and NaN for ``float`` fields.
        :raises TypeError: if ``numtype`` is not int, float or Decimal, or
            is Decimal without ``decimal_places``.
        :raises Exception: if float is combined with ``decimal_places``,
            ``bits`` is invalid, or ``default`` is not a valid number for
            this field.
        """

        # Allow users to specify strings instead of Python types in case
        # docstring isn't clear
        if numtype == "int":
            numtype = int
        if numtype == "float":
            numtype = float

        # Decimals are stored as scaled integers; anything else other than
        # int or float is rejected
        if numtype is Decimal:
            numtype = int
            if not decimal_places:
                raise TypeError("To store Decimal instances, you must set the "
                                "decimal_places argument")
        elif numtype not in (int, float):
            raise TypeError("Can't use %r as a type, use int or float"
                            % numtype)

        # Sanity check
        if numtype is float and decimal_places:
            raise Exception("A float type and decimal_places argument %r are "
                            "incompatible" % decimal_places)

        intsizes = [8, 16, 32, 64]
        intcodes = ["B", "H", "I", "Q"]

        # Set up field configuration based on type and size
        if numtype is float:
            bits = 64  # Floats are converted to 64 bit ints
        else:
            if bits not in intsizes:
                raise Exception("Invalid bits %r, use 8, 16, 32, or 64"
                                % bits)

        # Type code for the *sortable* representation
        self.sortable_typecode = intcodes[intsizes.index(bits)]
        self._struct = struct.Struct(">" + str(self.sortable_typecode))

        self.numtype = numtype
        self.bits = bits
        self.stored = stored
        self.unique = unique
        self.decimal_places = decimal_places
        self.shift_step = shift_step
        self.signed = signed
        self.analyzer = analysis.IDAnalyzer()
        # Don't store any information other than the doc ID
        self.format = formats.Existence(field_boost=field_boost)
        self.min_value, self.max_value = self._min_max()

        # Column configuration
        if default is None:
            if numtype is int:
                default = typecode_max[self.sortable_typecode]
            else:
                default = NaN
        elif not self.is_valid(default):
            raise Exception("The default %r is not a valid number for this "
                            "field" % default)

        self.default = default
        self.set_sortable(sortable)

    def __getstate__(self):
        # The compiled Struct is left out of the pickled state and rebuilt
        # in __setstate__
        d = self.__dict__.copy()
        if "_struct" in d:
            del d["_struct"]
        return d

    def __setstate__(self, d):
        self.__dict__.update(d)
        self._struct = struct.Struct(">" + str(self.sortable_typecode))
        if "min_value" not in d:
            # State saved without the range limits: recompute them and store
            # them on the instance. Writing them into ``d`` would have no
            # effect, since ``d`` was already copied into ``__dict__``.
            self.min_value, self.max_value = self._min_max()

    def _min_max(self):
        """
        Returns the ``(min_value, max_value)`` pair of numbers this field can
        represent, derived from the lowest and highest sortable integers.
        """
        numtype = self.numtype
        bits = self.bits
        signed = self.signed

        # Calculate the minimum and maximum possible values for error checking
        min_value = from_sortable(numtype, bits, signed, 0)
        max_value = from_sortable(numtype, bits, signed, 2 ** bits - 1)

        return min_value, max_value

    def default_column(self):
        return columns.NumericColumn(self.sortable_typecode,
                                     default=self.default)

    def is_valid(self, x):
        """
        Returns True if ``x`` can be converted with ``to_bytes()``, or False
        if the conversion fails with a value or arithmetic error.
        """
        try:
            self.to_bytes(x)
        except (ValueError, ArithmeticError):
            # ArithmeticError covers OverflowError as well as
            # decimal.InvalidOperation, which Decimal() raises for an
            # unparseable string when decimal_places is set.
            return False
        return True

    def index(self, num, **kwargs):
        """
        Yields ``(bytes, frequency, weight, value)`` tuples for the given
        number, or for every number in a list or tuple. With a non-zero
        ``shift_step`` one term is produced per shift level.
        """
        # If the user gave us a list of numbers, recurse on the list
        if isinstance(num, (list, tuple)):
            for n in num:
                for item in self.index(n):
                    yield item
            return

        # word, freq, weight, valuestring
        if self.shift_step:
            for shift in xrange(0, self.bits, self.shift_step):
                yield (self.to_bytes(num, shift), 1, 1.0, emptybytes)
        else:
            yield (self.to_bytes(num), 1, 1.0, emptybytes)

    def prepare_number(self, x):
        """
        Converts ``x`` to the field's numeric type, scaling strings and
        Decimals by ``10 ** decimal_places`` when that is set. Empty bytes
        and None are returned unchanged.

        :raises TypeError: for a Decimal when ``decimal_places`` is not set.
        :raises ValueError: if the value overflows or is out of range.
        """
        if x == emptybytes or x is None:
            return x

        dc = self.decimal_places
        if dc and isinstance(x, (string_type, Decimal)):
            x = Decimal(x) * (10 ** dc)
        elif isinstance(x, Decimal):
            raise TypeError("Can't index a Decimal object unless you specified "
                            "decimal_places on the field")

        try:
            x = self.numtype(x)
        except OverflowError:
            raise ValueError("Value %r overflowed number type %r"
                             % (x, self.numtype))

        if x < self.min_value or x > self.max_value:
            raise ValueError("Numeric field value %s out of range [%s, %s]"
                             % (x, self.min_value, self.max_value))
        return x

    def unprepare_number(self, x):
        """
        Reverses the ``decimal_places`` scaling, returning a Decimal when
        ``decimal_places`` is set and ``x`` unchanged otherwise.
        """
        dc = self.decimal_places
        if dc:
            # Shift the decimal exponent rather than slicing str(x): slicing
            # gives the wrong value when x has fewer digits than dc (5 with
            # dc=4 became 0.5) and fails on negative numbers.
            x = Decimal(x).scaleb(-dc)
        return x

    def to_column_value(self, x):
        # Only the first item of a sequence is used
        if isinstance(x, (list, tuple, array)):
            x = x[0]
        x = self.prepare_number(x)
        return to_sortable(self.numtype, self.bits, self.signed, x)

    def from_column_value(self, x):
        x = from_sortable(self.numtype, self.bits, self.signed, x)
        return self.unprepare_number(x)

    def to_bytes(self, x, shift=0):
        """
        Returns the term bytes for ``x``: a byte holding ``shift`` followed
        by the packed sortable integer shifted right by ``shift`` bits.
        """
        # Try to avoid re-encoding; this sucks because on Python 2 we can't
        # tell the difference between a string and encoded bytes, so we have
        # to require the user use unicode when they mean string
        if isinstance(x, bytes_type):
            return x

        if x == emptybytes or x is None:
            return self.sortable_to_bytes(0)

        x = self.prepare_number(x)
        x = to_sortable(self.numtype, self.bits, self.signed, x)
        return self.sortable_to_bytes(x, shift)

    def sortable_to_bytes(self, x, shift=0):
        if shift:
            x >>= shift
        return pack_byte(shift) + self._struct.pack(x)

    def from_bytes(self, bs):
        # The first byte is the shift; the rest is the packed sortable int
        x = self._struct.unpack(bs[1:])[0]
        x = from_sortable(self.numtype, self.bits, self.signed, x)
        x = self.unprepare_number(x)
        return x

    def process_text(self, text, **kwargs):
        return (self.to_bytes(text),)

    def self_parsing(self):
        return True

    def parse_query(self, fieldname, qstring, boost=1.0):
        """
        Returns ``query.Every`` for ``"*"``, otherwise a ``query.Term`` on
        the encoded number.

        :raises QueryParserError: if ``qstring`` is not a valid number.
        """
        from whoosh import query
        from whoosh.qparser.common import QueryParserError

        if qstring == "*":
            return query.Every(fieldname, boost=boost)

        if not self.is_valid(qstring):
            raise QueryParserError("%r is not a valid number" % qstring)

        token = self.to_bytes(qstring)
        return query.Term(fieldname, token, boost=boost)

    def parse_range(self, fieldname, start, end, startexcl, endexcl,
                    boost=1.0):
        """
        Returns a ``query.NumericRange`` for the given bounds; either bound
        may be None.

        :raises QueryParserError: if a bound is not a valid number.
        """
        from whoosh import query
        from whoosh.qparser.common import QueryParserError

        if start is not None:
            if not self.is_valid(start):
                raise QueryParserError("Range start %r is not a valid number"
                                       % start)
            start = self.prepare_number(start)
        if end is not None:
            if not self.is_valid(end):
                raise QueryParserError("Range end %r is not a valid number"
                                       % end)
            end = self.prepare_number(end)
        return query.NumericRange(fieldname, start, end, startexcl, endexcl,
                                  boost=boost)

    def sortable_terms(self, ixreader, fieldname):
        zero = b"\x00"
        for token in ixreader.lexicon(fieldname):
            if token[0:1] != zero:
                # Only yield the full-precision values
                break
            yield token
- class DATETIME(NUMERIC):
- """
- Special field type that lets you index datetime objects. The field
- converts the datetime objects to sortable text for you before indexing.
- Since this field is based on Python's datetime module it shares all the
- limitations of that module, such as the inability to represent dates before
- year 1 in the proleptic Gregorian calendar. However, since this field
- stores datetimes as an integer number of microseconds, it could easily
- represent a much wider range of dates if the Python datetime implementation
- ever supports them.
- >>> schema = Schema(path=STORED, date=DATETIME)
- >>> ix = storage.create_index(schema)
- >>> w = ix.writer()
- >>> w.add_document(path="/a", date=datetime.now())
- >>> w.commit()
- """
- def __init__(self, stored=False, unique=False, sortable=False):
- """
- :param stored: Whether the value of this field is stored with the
- document.
- :param unique: Whether the value of this field is unique per-document.
- """
- super(DATETIME, self).__init__(int, 64, stored=stored,
- unique=unique, shift_step=8,
- sortable=sortable)
- def prepare_datetime(self, x):
- from whoosh.util.times import floor
- if isinstance(x, text_type):
- # For indexing, support same strings as for query parsing --
- # convert unicode to datetime object
- x = self._parse_datestring(x)
- x = floor(x) # this makes most sense (unspecified = lowest)
- if isinstance(x, datetime.datetime):
- return datetime_to_long(x)
- elif isinstance(x, bytes_type):
- return x
- else:
- raise Exception("%r is not a datetime" % (x,))
- def to_column_value(self, x):
- if isinstance(x, bytes_type):
- raise Exception("%r is not a datetime" % (x,))
- if isinstance(x, (list, tuple)):
- x = x[0]
- return self.prepare_datetime(x)
- def from_column_value(self, x):
- return long_to_datetime(x)
- def to_bytes(self, x, shift=0):
- x = self.prepare_datetime(x)
- return NUMERIC.to_bytes(self, x, shift=shift)
- def from_bytes(self, bs):
- x = NUMERIC.from_bytes(self, bs)
- return long_to_datetime(x)
def _parse_datestring(self, qstring):
    """
    Parses a very simple datetime representation of the form
    ``YYYY[MM[DD[hh[mm[ss[uuuuuu]]]]]]``.

    Spaces, dashes and periods are stripped before parsing. Components the
    string is too short to contain are left as ``None``.

    :raises Exception: if the parsed value is void according to
        ``is_void``.
    """

    from whoosh.util.times import adatetime, fix, is_void

    digits = qstring.replace(" ", "").replace("-", "").replace(".", "")
    size = len(digits)

    # Consecutive offsets delimit year, month, day, hour, minute, second
    offsets = (0, 4, 6, 8, 10, 12, 14)
    parts = [int(digits[lo:hi]) if size >= hi else None
             for lo, hi in zip(offsets, offsets[1:])]
    # Microseconds are only read when the string is exactly 20 characters
    parts.append(int(digits[14:]) if size == 20 else None)

    at = fix(adatetime(*parts))
    if is_void(at):
        raise Exception("%r is not a parseable date" % digits)
    return at
def parse_query(self, fieldname, qstring, boost=1.0):
    """
    Turns a date string into a query object.

    :param fieldname: the name of the field to search.
    :param qstring: the date string, parsed by ``_parse_datestring``.
    :param boost: boost applied to the resulting query.
    :returns: ``query.error_query(...)`` if the string cannot be parsed, a
        ``query.NumericRange`` covering floor..ceil if the parsed date is
        ambiguous, otherwise a ``query.Term``.
    """

    from whoosh import query
    from whoosh.util.times import is_ambiguous

    try:
        at = self._parse_datestring(qstring)
    except Exception:
        # Only trap real errors; a bare ``except`` would also swallow
        # KeyboardInterrupt and SystemExit
        e = sys.exc_info()[1]
        return query.error_query(e)

    if is_ambiguous(at):
        startnum = datetime_to_long(at.floor())
        endnum = datetime_to_long(at.ceil())
        # Pass the boost along so ambiguous dates are weighted the same
        # way as exact ones
        return query.NumericRange(fieldname, startnum, endnum, boost=boost)
    else:
        return query.Term(fieldname, at, boost=boost)
def parse_range(self, fieldname, start, end, startexcl, endexcl,
                boost=1.0):
    """
    Builds a range query from optional start and end date strings.

    With neither bound given the result is ``query.Every``. Otherwise the
    start is parsed and floored, the end is parsed and ceiled, and both are
    converted with ``datetime_to_long`` for a ``query.NumericRange``.

    Note that ``startexcl`` and ``endexcl`` are accepted but are not
    forwarded to the range query.
    """

    from whoosh import query

    if start is None and end is None:
        return query.Every(fieldname, boost=boost)

    lower = start
    if lower is not None:
        lower = datetime_to_long(self._parse_datestring(lower).floor())

    upper = end
    if upper is not None:
        upper = datetime_to_long(self._parse_datestring(upper).ceil())

    return query.NumericRange(fieldname, lower, upper, boost=boost)
class BOOLEAN(FieldType):
    """
    Special field type that lets you index boolean values (True and False).
    The field converts the boolean values to text for you before indexing.

    >>> schema = Schema(path=STORED, done=BOOLEAN)
    >>> ix = storage.create_index(schema)
    >>> w = ix.writer()
    >>> w.add_document(path="/a", done=False)
    >>> w.commit()
    """

    # Indexed terms, positioned so that int(False) and int(True) pick them
    bytestrings = (b"f", b"t")
    # Lowercased strings recognized as true / false
    trues = frozenset(u"t true yes 1".split())
    falses = frozenset(u"f false no 0".split())

    def __init__(self, stored=False, field_boost=1.0):
        """
        :param stored: Whether the value of this field is stored with the
            document.
        :param field_boost: passed to the ``formats.Existence`` format.
        """

        self.stored = stored
        # Don't store any information other than the doc ID
        self.format = formats.Existence(field_boost=field_boost)

    def _obj_to_bool(self, x):
        """
        Interprets a query value as a boolean.

        Strings such as "true", "false", "yes" and "no" are special-cased
        (case-insensitively); anything else goes through ``bool()``, which
        lets you pass arbitrary objects as query values.
        """

        if isinstance(x, string_type):
            lowered = x.lower()
            if lowered in self.trues:
                return True
            if lowered in self.falses:
                return False
        return bool(x)

    def to_bytes(self, x):
        """
        Returns the term bytes for ``x``. Byte strings are returned as-is,
        strings are true only if they are in ``trues``, and other objects
        are converted with ``bool()``.
        """

        if isinstance(x, bytes_type):
            return x

        if isinstance(x, string_type):
            flag = x.lower() in self.trues
        else:
            flag = bool(x)
        return self.bytestrings[int(flag)]

    def index(self, bit, **kwargs):
        """
        Returns a single-item posting list for the given value. Strings are
        true only if they are in ``trues``; other objects are converted
        with ``bool()``.
        """

        if isinstance(bit, string_type):
            flag = bit.lower() in self.trues
        else:
            flag = bool(bit)

        # word, freq, weight, valuestring
        posting = (self.bytestrings[int(flag)], 1, 1.0, emptybytes)
        return [posting]

    def self_parsing(self):
        """
        Always returns True.
        """

        return True

    def parse_query(self, fieldname, qstring, boost=1.0):
        """
        Returns ``query.Every`` for the string ``"*"``, otherwise a
        ``query.Term`` holding the boolean interpretation of ``qstring``.
        """

        from whoosh import query

        if qstring == "*":
            return query.Every(fieldname, boost=boost)

        value = self._obj_to_bool(qstring)
        return query.Term(fieldname, value, boost=boost)
class STORED(FieldType):
    """
    Configured field type for fields you want to store but not index.
    """

    stored = True
    indexed = False

    def __init__(self):
        """
        Takes no configuration; the class attributes define the behavior.
        """
class COLUMN(FieldType):
    """
    Configured field type for fields you want to store as a per-document
    value column but not index.
    """

    stored = False
    indexed = False

    def __init__(self, columnobj=None):
        """
        :param columnobj: the ``columns.Column`` instance to use. If this
            is None, a ``columns.VarBytesColumn`` is created.
        :raises TypeError: if the object is not a ``columns.Column``.
        """

        column = columns.VarBytesColumn() if columnobj is None else columnobj
        if not isinstance(column, columns.Column):
            raise TypeError("%r is not a column object" % (column,))
        self.column_type = column

    def to_bytes(self, v):
        """
        Returns the value unchanged.
        """

        return v

    def from_bytes(self, b):
        """
        Returns the bytes unchanged.
        """

        return b
class KEYWORD(FieldType):
    """
    Configured field type for fields containing space-separated or
    comma-separated keyword-like data (such as tags). The default is to not
    store positional information (so phrase searching is not allowed in this
    field) and to not make the field scorable.
    """

    def __init__(self, stored=False, lowercase=False, commas=False,
                 scorable=False, unique=False, field_boost=1.0, sortable=False,
                 vector=None, analyzer=None):
        """
        :param stored: Whether to store the value of the field with the
            document.
        :param lowercase: passed to ``analysis.KeywordAnalyzer``.
        :param commas: Whether this is a comma-separated field. If this is
            False (the default), it is treated as a space-separated field.
        :param scorable: Whether this field is scorable.
        :param unique: stored as the field's ``unique`` attribute.
        :param field_boost: passed to the ``formats.Frequency`` format.
        :param sortable: if true, the field's column type is set from
            ``self.default_column()``.
        :param vector: a :class:`whoosh.formats.Format` instance is used as
            the vector format; any other true value reuses the field's
            index format; otherwise no vector is stored.
        :param analyzer: an analyzer to use instead of the default
            ``analysis.KeywordAnalyzer``. When given, ``lowercase`` and
            ``commas`` are not used.
        """

        self.analyzer = analyzer or analysis.KeywordAnalyzer(
            lowercase=lowercase, commas=commas)

        # Store field lengths and weights along with doc ID
        self.format = formats.Frequency(field_boost=field_boost)
        self.scorable = scorable
        self.stored = stored
        self.unique = unique

        if isinstance(vector, formats.Format):
            self.vector = vector
        else:
            self.vector = self.format if vector else None

        if sortable:
            self.column_type = self.default_column()
class TEXT(FieldType):
    """
    Configured field type for text fields (for example, the body text of an
    article). The default is to store positional information to allow phrase
    searching. This field type is always scorable.
    """

    def __init__(self, analyzer=None, phrase=True, chars=False, stored=False,
                 field_boost=1.0, multitoken_query="default", spelling=False,
                 sortable=False, lang=None, vector=None,
                 spelling_prefix="spell_"):
        """
        :param analyzer: The analysis.Analyzer to use to index the field
            contents. See the analysis module for more information. If you
            omit this argument, the field uses analysis.StandardAnalyzer.
        :param phrase: Whether to store positional information to allow
            phrase searching.
        :param chars: Whether to store character ranges along with
            positions. If this is True, "phrase" is also implied.
        :param stored: Whether to store the value of this field with the
            document. Since this field type generally contains a lot of
            text, you should avoid storing it with the document unless you
            need to, for example to allow fast excerpts in the search
            results.
        :param field_boost: passed to the selected format class.
        :param multitoken_query: stored as the field's
            ``multitoken_query`` attribute.
        :param spelling: if True, and if the field's analyzer changes the
            form of term text (such as a stemming analyzer), this field
            will store extra information in a separate field (named using
            the ``spelling_prefix`` keyword argument) to allow spelling
            suggestions to use the unchanged word forms as spelling
            suggestions.
        :param sortable: If True, make this field sortable using the
            default column type. If you pass a
            :class:`whoosh.columns.Column` instance instead of True, the
            field will use the given column type.
        :param lang: automatically configure a
            :class:`whoosh.analysis.LanguageAnalyzer` for the given
            language. This is ignored if you also specify an ``analyzer``.
        :param vector: if this value evaluates to true, store a list of the
            terms in this field in each document. If the value is an
            instance of :class:`whoosh.formats.Format`, the index will use
            the object to store the term vector. Any other true value (e.g.
            ``vector=True``) will use the field's index format to store the
            term vector as well.
        :param spelling_prefix: prefix for the name of the separate
            spelling field.
        """

        # An explicit analyzer wins over a language, which wins over the
        # standard analyzer
        if not analyzer:
            if lang:
                analyzer = analysis.LanguageAnalyzer(lang)
            else:
                analyzer = analysis.StandardAnalyzer()
        self.analyzer = analyzer

        # Character ranges imply positions; otherwise fall back to plain
        # frequencies
        formatclass = formats.Frequency
        if chars:
            formatclass = formats.Characters
        elif phrase:
            formatclass = formats.Positions
        self.format = formatclass(field_boost=field_boost)

        if not sortable:
            self.column_type = None
        elif isinstance(sortable, columns.Column):
            self.column_type = sortable
        else:
            self.column_type = columns.VarBytesColumn()

        self.spelling = spelling
        self.spelling_prefix = spelling_prefix
        self.multitoken_query = multitoken_query
        self.scorable = True
        self.stored = stored

        if isinstance(vector, formats.Format):
            self.vector = vector
        else:
            self.vector = self.format if vector else None

    def subfields(self):
        """
        Yields ``(prefix, field)`` pairs: always this field with an empty
        prefix, plus a :class:`SpellField` when separate spelling applies.
        """

        yield "", self

        # If the user indicated this is a spellable field, and the analyzer
        # is morphic, then also index into a spelling-only field that stores
        # minimal information
        if self.separate_spelling():
            spell_field = SpellField(self.analyzer)
            yield self.spelling_prefix, spell_field

    def separate_spelling(self):
        """
        Returns a true value when spelling is enabled and the analyzer
        reports ``has_morph()``.
        """

        return self.spelling and self.analyzer.has_morph()

    def spelling_fieldname(self, fieldname):
        """
        Returns the name of the field holding spelling words: the prefixed
        name when separate spelling applies, otherwise ``fieldname``
        itself.
        """

        if not self.separate_spelling():
            return fieldname
        return self.spelling_prefix + fieldname
class SpellField(FieldType):
    """
    This is a utility field type meant to be returned by ``TEXT.subfields()``
    when it needs a minimal field to store the spellable words.
    """

    def __init__(self, analyzer):
        """
        :param analyzer: the analyzer used to produce the spellable words.
        """

        self.format = formats.Frequency()
        self.analyzer = analyzer
        self.column_type = None
        # This attribute was previously misspelled "scorabe", so the
        # intended value was never actually set on the instance
        self.scorable = False
        self.stored = False
        self.unique = False
        self.indexed = True
        self.spelling = False

    # All the text analysis methods add "nomorph" to the keywords to get
    # unmorphed term texts

    def index(self, value, boost=1.0, **kwargs):
        """
        Same as ``FieldType.index`` but forces ``nomorph=True``.
        """

        kwargs["nomorph"] = True
        return FieldType.index(self, value, boost=boost, **kwargs)

    def tokenize(self, value, **kwargs):
        """
        Same as ``FieldType.tokenize`` but forces ``nomorph=True``.
        """

        kwargs["nomorph"] = True
        return FieldType.tokenize(self, value, **kwargs)

    # The method above used to be misspelled, so it never overrode
    # ``FieldType.tokenize``. Keep the old name for any existing callers.
    tokenzie = tokenize

    def process_text(self, qstring, mode='', **kwargs):
        """
        Same as ``FieldType.process_text`` but forces ``nomorph=True``.
        """

        kwargs["nomorph"] = True
        return FieldType.process_text(self, qstring, mode=mode, **kwargs)
class NGRAM(FieldType):
    """
    Configured field that indexes text as N-grams. For example, with a field
    type NGRAM(3,4), the value "hello" will be indexed as tokens
    "hel", "hell", "ell", "ello", "llo". This field type chops the entire text
    into N-grams, including whitespace and punctuation. See :class:`NGRAMWORDS`
    for a field type that breaks the text into words first before chopping the
    words into N-grams.
    """

    scorable = True

    def __init__(self, minsize=2, maxsize=4, stored=False, field_boost=1.0,
                 queryor=False, phrase=False, sortable=False):
        """
        :param minsize: The minimum length of the N-grams.
        :param maxsize: The maximum length of the N-grams.
        :param stored: Whether to store the value of this field with the
            document. Since this field type generally contains a lot of
            text, you should avoid storing it with the document unless you
            need to, for example to allow fast excerpts in the search
            results.
        :param field_boost: passed to the selected format class.
        :param queryor: if True, combine the N-grams with an Or query. The
            default is to combine N-grams with an And query.
        :param phrase: store positions on the N-grams to allow exact phrase
            searching. The default is off.
        :param sortable: passed to ``self.set_sortable()``.
        """

        formatclass = formats.Positions if phrase else formats.Frequency

        # The analyzer only needs to be built once (it used to be
        # constructed twice with identical arguments)
        self.analyzer = analysis.NgramAnalyzer(minsize, maxsize)
        self.format = formatclass(field_boost=field_boost)
        self.stored = stored
        self.queryor = queryor
        self.set_sortable(sortable)

    def self_parsing(self):
        """
        Always returns True.
        """

        return True

    def parse_query(self, fieldname, qstring, boost=1.0):
        """
        Splits the query string into N-grams with ``process_text`` and
        combines one ``query.Term`` per N-gram using ``query.Or`` (when
        ``queryor`` is set) or ``query.And``.
        """

        from whoosh import query

        terms = [query.Term(fieldname, g)
                 for g in self.process_text(qstring, mode='query')]
        cls = query.Or if self.queryor else query.And
        return cls(terms, boost=boost)
class NGRAMWORDS(NGRAM):
    """
    Configured field that chops text into words using a tokenizer,
    lowercases the words, and then chops the words into N-grams.
    """

    scorable = True

    def __init__(self, minsize=2, maxsize=4, stored=False, field_boost=1.0,
                 tokenizer=None, at=None, queryor=False, sortable=False):
        """
        :param minsize: The minimum length of the N-grams.
        :param maxsize: The maximum length of the N-grams.
        :param stored: Whether to store the value of this field with the
            document. Since this field type generally contains a lot of
            text, you should avoid storing it with the document unless you
            need to, for example to allow fast excerpts in the search
            results.
        :param field_boost: passed to the ``formats.Frequency`` format.
        :param tokenizer: an instance of :class:`whoosh.analysis.Tokenizer`
            used to break the text into words.
        :param at: if 'start', only takes N-grams from the start of the
            word. If 'end', only takes N-grams from the end. Otherwise the
            default is to take all N-grams from each word.
        :param queryor: if True, combine the N-grams with an Or query. The
            default is to combine N-grams with an And query.
        :param sortable: passed to ``self.set_sortable()``.
        """

        word_analyzer = analysis.NgramWordAnalyzer(minsize, maxsize,
                                                   tokenizer, at=at)
        self.analyzer = word_analyzer
        self.format = formats.Frequency(field_boost=field_boost)
        self.stored = stored
        self.queryor = queryor
        self.set_sortable(sortable)
- # Other fields
class ReverseField(FieldWrapper):
    """
    Wraps another field and indexes its terms through an additional
    ``analysis.ReverseTextFilter`` under a prefixed field name.
    """

    def __init__(self, subfield, prefix="rev_"):
        """
        :param subfield: the field object to wrap; its analyzer is extended
            with the reversing filter.
        :param prefix: forwarded to ``FieldWrapper.__init__``.
        """

        FieldWrapper.__init__(self, subfield, prefix)

        reversing = analysis.ReverseTextFilter()
        self.analyzer = subfield.analyzer | reversing
        # NOTE(review): BasicFormat is referenced without a module
        # qualifier -- confirm the name is actually in scope in this module
        self.format = BasicFormat(lengths=False, weights=False)

        self.scorable = False
        self.set_sortable(False)
        self.stored = False
        self.unique = False
        self.vector = False

    def subfields(self):
        """
        Yields the wrapped field under an empty prefix, then this field
        under ``self.name_prefix``.
        """

        yield "", self.subfield
        yield self.name_prefix, self
- # Schema class
class MetaSchema(type):
    """
    Metaclass that collects the non-dunder class attributes of declarative
    schema classes into a ``_clsfields`` dictionary.
    """

    def __new__(cls, name, bases, attrs):
        """
        Creates the class. When none of the bases was itself created by
        this metaclass the class is built normally. Otherwise only the
        double-underscore attributes are set on the new class, and all
        remaining attributes are merged (after the bases' ``_clsfields``)
        into the new class's ``_clsfields``.
        """

        create = super(MetaSchema, cls).__new__

        # If this isn't a subclass of a MetaSchema-built class, don't do
        # anything special
        if not any(isinstance(base, MetaSchema) for base in bases):
            return create(cls, name, bases, attrs)

        # Move the special (double-underscore) attributes out of attrs;
        # whatever is left is treated as field declarations
        dunder_keys = [key for key in attrs.keys() if key.startswith("__")]
        dunders = dict((key, attrs.pop(key)) for key in dunder_keys)
        new_class = create(cls, name, bases, dunders)

        # Inherited fields first, so this class's own declarations win
        collected = {}
        for base in bases:
            if hasattr(base, "_clsfields"):
                collected.update(base._clsfields)
        collected.update(attrs)

        new_class._clsfields = collected
        return new_class

    def schema(self):
        """
        Returns a ``Schema`` built from the collected class fields.
        """

        return Schema(**self._clsfields)
class Schema(object):
    """
    Represents the collection of fields in an index. Maps field names to
    FieldType objects which define the behavior of each field.

    Besides the regular fields, a schema keeps track of the sub-fields each
    field contributes and of "dynamic" fields, which are matched against
    field names using glob patterns.
    """

    def __init__(self, **fields):
        """
        All keyword arguments to the constructor are treated as fieldname =
        fieldtype pairs. The fieldtype can be an instantiated FieldType
        object, or a FieldType sub-class (in which case the Schema will
        instantiate it with the default constructor before adding it).

        For example::

            s = Schema(content = TEXT,
                       title = TEXT(stored = True),
                       tags = KEYWORD(stored = True))
        """

        self._fields = {}
        self._subfields = {}
        self._dyn_fields = {}

        # Fields are added in sorted name order
        for name in sorted(fields.keys()):
            self.add(name, fields[name])

    def copy(self):
        """
        Returns a shallow copy of the schema. The field instances are not
        deep copied, so they are shared between schema copies.
        """

        return self.__class__(**self._fields)

    def __eq__(self, other):
        return (other.__class__ is self.__class__
                and list(self.items()) == list(other.items()))

    def __ne__(self, other):
        return not(self.__eq__(other))

    def __repr__(self):
        return "<%s: %r>" % (self.__class__.__name__, self.names())

    def __iter__(self):
        """
        Returns the field objects in this schema.
        """

        return iter(self._fields.values())

    def __getitem__(self, name):
        """
        Returns the field associated with the given field name.

        :raises KeyError: if the name is neither a regular field nor
            matched by a dynamic field pattern.
        """

        # If the name is in the dictionary, just return it
        if name in self._fields:
            return self._fields[name]

        # Check if the name matches a dynamic field
        for expr, fieldtype in itervalues(self._dyn_fields):
            if expr.match(name):
                return fieldtype

        raise KeyError("No field named %r" % (name,))

    def __len__(self):
        """
        Returns the number of fields in this schema.
        """

        return len(self._fields)

    def __contains__(self, fieldname):
        """
        Returns True if a field by the given name is in this schema.
        """

        # Defined in terms of __getitem__ so that there's only one method to
        # override to provide dynamic fields
        try:
            field = self[fieldname]
            return field is not None
        except KeyError:
            return False

    def __setstate__(self, state):
        # State without a "_subfields" entry gets an empty one
        if "_subfields" not in state:
            state["_subfields"] = {}
        self.__dict__.update(state)

    def to_bytes(self, fieldname, value):
        """
        Converts the value using the ``to_bytes`` method of the named field.
        """

        return self[fieldname].to_bytes(value)

    def items(self):
        """
        Returns a list of ("fieldname", field_object) pairs for the fields
        in this schema.
        """

        return sorted(self._fields.items())

    def names(self, check_names=None):
        """
        Returns a list of the names of the fields in this schema.

        :param check_names: (optional) sequence of field names to check
            whether the schema accepts them as (dynamic) field names -
            acceptable names will also be in the result list.
            Note: You may also have static field names in check_names, that
            won't create duplicates in the result list. Unsupported names
            will not be in the result list.
        """

        fieldnames = set(self._fields.keys())
        if check_names is not None:
            check_names = set(check_names) - fieldnames
            fieldnames.update(fieldname for fieldname in check_names
                              if fieldname in self)
        return sorted(fieldnames)

    def clean(self):
        """
        Calls ``clean()`` on every field object in the schema.
        """

        for field in self:
            field.clean()

    def add(self, name, fieldtype, glob=False):
        """
        Adds a field to this schema.

        :param name: The name of the field.
        :param fieldtype: An instantiated fields.FieldType object, or a
            FieldType subclass. If you pass an instantiated object, the
            schema will use that as the field configuration for this field.
            If you pass a FieldType subclass, the schema will automatically
            instantiate it with the default constructor.
        :param glob: if true, ``name`` is treated as a glob pattern and the
            field is registered as a dynamic field.
        :raises FieldConfigurationError: if the field cannot be
            instantiated, is not a FieldType, has an invalid name, or is
            already in the schema.
        """

        # If the user passed a type rather than an instantiated field object,
        # instantiate it automatically
        if type(fieldtype) is type:
            try:
                fieldtype = fieldtype()
            except Exception:
                # Only wrap real errors; a bare ``except`` would also turn
                # KeyboardInterrupt/SystemExit into configuration errors
                e = sys.exc_info()[1]
                raise FieldConfigurationError("Error: %s instantiating field "
                                              "%r: %r" % (e, name, fieldtype))

        if not isinstance(fieldtype, FieldType):
            # Wrap the value in a tuple so that tuple values don't break
            # the string formatting
            raise FieldConfigurationError("%r is not a FieldType object"
                                          % (fieldtype,))

        self._subfields[name] = sublist = []
        for prefix, subfield in fieldtype.subfields():
            fname = prefix + name
            sublist.append(fname)

            # Check field name
            if fname.startswith("_"):
                raise FieldConfigurationError("Names cannot start with _")
            elif " " in fname:
                raise FieldConfigurationError("Names cannot contain spaces")
            elif fname in self._fields or (glob and fname in self._dyn_fields):
                raise FieldConfigurationError("%r already in schema" % fname)

            # Add the field
            if glob:
                expr = re.compile(fnmatch.translate(name))
                self._dyn_fields[fname] = (expr, subfield)
            else:
                fieldtype.on_add(self, fname)
                self._fields[fname] = subfield

    def remove(self, fieldname):
        """
        Removes the named field (and any sub-fields recorded for it), or the
        dynamic field registered under that name.

        :raises KeyError: if there is no such regular or dynamic field.
        """

        if fieldname in self._fields:
            self._fields[fieldname].on_remove(self, fieldname)
            del self._fields[fieldname]

            if fieldname in self._subfields:
                for subname in self._subfields[fieldname]:
                    if subname in self._fields:
                        del self._fields[subname]
                del self._subfields[fieldname]

        elif fieldname in self._dyn_fields:
            del self._dyn_fields[fieldname]

        else:
            # Tuple-wrap so an unusual (tuple) key still yields a KeyError
            raise KeyError("No field named %r" % (fieldname,))

    def indexable_fields(self, fieldname):
        """
        Yields (name, field_object) pairs for the sub-fields recorded for
        the given field name, or for the field itself if none are recorded.
        """

        if fieldname in self._subfields:
            for subname in self._subfields[fieldname]:
                yield subname, self._fields[subname]
        else:
            # Use __getitem__ here instead of getting it directly from _fields
            # because it might be a glob
            yield fieldname, self[fieldname]

    def has_scorable_fields(self):
        """
        Returns True if any field object in the schema is scorable.
        """

        return any(ftype.scorable for ftype in self)

    def stored_names(self):
        """
        Returns a list of the names of fields that are stored.
        """

        return [name for name, field in self.items() if field.stored]

    def scorable_names(self):
        """
        Returns a list of the names of fields whose ``scorable`` attribute
        is true.
        """

        return [name for name, field in self.items() if field.scorable]
class SchemaClass(with_metaclass(MetaSchema, Schema)):
    """
    Allows you to define a schema using declarative syntax, similar to
    Django models::

        class MySchema(SchemaClass):
            path = ID
            date = DATETIME
            content = TEXT

    You can use inheritance to share common fields between schemas::

        class Parent(SchemaClass):
            path = ID(stored=True)
            date = DATETIME

        class Child1(Parent):
            content = TEXT(phrase=False)

        class Child2(Parent):
            tags = KEYWORD

    This class overrides ``__new__`` so instantiating your sub-class always
    results in an instance of ``Schema``.

    >>> class MySchema(SchemaClass):
    ...     title = TEXT(stored=True)
    ...     content = TEXT
    ...
    >>> s = MySchema()
    >>> type(s)
    <class 'whoosh.fields.Schema'>

    """

    def __new__(cls, *args, **kwargs):
        """
        Builds a plain ``Schema`` from the fields declared on the class,
        plus any extra fields given as keyword arguments.
        """

        obj = super(Schema, cls).__new__(Schema)

        # Work on a copy: updating the class-level dictionary in place
        # would leak the keyword fields of one instantiation into every
        # later instance of this class
        kw = dict(getattr(cls, "_clsfields", {}))
        kw.update(kwargs)

        # The returned object is not an instance of cls, so Python will not
        # call __init__ automatically
        obj.__init__(*args, **kw)
        return obj
def ensure_schema(schema):
    """
    Returns a ``Schema`` instance for the given object.

    If ``schema`` is a class derived from ``Schema``, its ``schema()`` method
    is called to obtain the instance.

    :raises FieldConfigurationError: if the result is not a ``Schema``.
    """

    if isinstance(schema, type) and issubclass(schema, Schema):
        schema = schema.schema()
    if not isinstance(schema, Schema):
        # Tuple-wrap the value so that passing a tuple reports the intended
        # error instead of failing inside the string formatting
        raise FieldConfigurationError("%r is not a Schema" % (schema,))
    return schema
def merge_fielddict(d1, d2):
    """
    Merges two name -> field dictionaries into a new dictionary.

    When a name has a true value in both dictionaries the two values must
    be equal. Otherwise the first true value is used, falling back to the
    value from ``d2``.

    :raises Exception: if both dictionaries hold different true values for
        the same name.
    """

    merged = {}
    for name in set(d1.keys()) | set(d2.keys()):
        left, right = d1.get(name), d2.get(name)
        if left and right:
            if left != right:
                raise Exception("Inconsistent field %r: %r != %r"
                                % (name, left, right))
        merged[name] = left or right
    return merged
def merge_schema(s1, s2):
    """
    Returns a new ``Schema`` whose regular and dynamic field dictionaries
    are the ``merge_fielddict`` combination of those of ``s1`` and ``s2``.

    Only ``_fields`` and ``_dyn_fields`` are merged here.
    """

    merged = Schema()
    for attr in ("_fields", "_dyn_fields"):
        combined = merge_fielddict(getattr(s1, attr), getattr(s2, attr))
        setattr(merged, attr, combined)
    return merged
def merge_schemas(schemas):
    """
    Folds a non-empty sequence of schemas into one by applying
    ``merge_schema`` from left to right, starting with the first schema.
    """

    merged = schemas[0]
    for other in schemas[1:]:
        merged = merge_schema(merged, other)
    return merged
|