- # Copyright 2009-present MongoDB, Inc.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- """BSON (Binary JSON) encoding and decoding.
- The mapping from Python types to BSON types is as follows:
- ======================================= ============= ===================
- Python Type                             BSON Type     Supported Direction
- ======================================= ============= ===================
- None                                    null          both
- bool                                    boolean       both
- int [#int]_                             int32 / int64 py -> bson
- long                                    int64         py -> bson
- `bson.int64.Int64`                      int64         both
- float                                   number (real) both
- string                                  string        py -> bson
- unicode                                 string        both
- list                                    array         both
- dict / `SON`                            object        both
- datetime.datetime [#dt]_ [#dt2]_        date          both
- `bson.regex.Regex`                      regex         both
- compiled re [#re]_                      regex         py -> bson
- `bson.binary.Binary`                    binary        both
- `bson.objectid.ObjectId`                oid           both
- `bson.dbref.DBRef`                      dbref         both
- None                                    undefined     bson -> py
- unicode                                 code          bson -> py
- `bson.code.Code`                        code          py -> bson
- unicode                                 symbol        bson -> py
- bytes (Python 3) [#bytes]_              binary        both
- ======================================= ============= ===================
- Note that, under Python 2.x, binary data must be wrapped as an instance of
- `bson.binary.Binary` in order to be stored as BSON binary; otherwise it will
- be saved as a BSON string and retrieved as unicode. Users of Python 3.x can
- use the Python bytes type directly.
- .. [#int] A Python int will be saved as a BSON int32 or BSON int64 depending
- on its size. A BSON int32 will always decode to a Python int. A BSON
- int64 will always decode to a :class:`~bson.int64.Int64`.
- .. [#dt] datetime.datetime instances are stored with millisecond precision;
- any microseconds beyond that are discarded when saved.
- .. [#dt2] All datetime.datetime instances are treated as *naive*. Clients
- should always use UTC.
- .. [#re] :class:`~bson.regex.Regex` instances and regular expression
- objects from ``re.compile()`` are both saved as BSON regular expressions.
- BSON regular expressions are decoded as :class:`~bson.regex.Regex`
- instances.
- .. [#bytes] The bytes type from Python 3.x is encoded as BSON binary with
- subtype 0. In Python 3.x it will be decoded back to bytes. In Python 2.x
- it will be decoded to an instance of :class:`~bson.binary.Binary` with
- subtype 0.
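- A short sketch of the mapping in practice (this assumes the module is
- importable as ``bson``; the encoded bytes themselves are not shown)::
-   import datetime
-   import bson
-   doc = {"active": True, "count": 1, "when": datetime.datetime.utcnow()}
-   data = bson.encode(doc)       # bytes holding a single BSON document
-   assert bson.decode(data)["count"] == 1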
- """
- import calendar
- import datetime
- import itertools
- import platform
- import re
- import struct
- import sys
- import uuid
- from codecs import (utf_8_decode as _utf_8_decode,
- utf_8_encode as _utf_8_encode)
- from bson.binary import (Binary, UuidRepresentation, ALL_UUID_SUBTYPES,
- OLD_UUID_SUBTYPE,
- JAVA_LEGACY, CSHARP_LEGACY,
- UUIDLegacy, UUID_SUBTYPE)
- from bson.code import Code
- from bson.codec_options import (
- CodecOptions, DEFAULT_CODEC_OPTIONS, _raw_document_class)
- from bson.dbref import DBRef
- from bson.decimal128 import Decimal128
- from bson.errors import (InvalidBSON,
- InvalidDocument,
- InvalidStringData)
- from bson.int64 import Int64
- from bson.max_key import MaxKey
- from bson.min_key import MinKey
- from bson.objectid import ObjectId
- from bson.py3compat import (abc,
- b,
- PY3,
- iteritems,
- text_type,
- string_type,
- reraise)
- from bson.regex import Regex
- from bson.son import SON, RE_TYPE
- from bson.timestamp import Timestamp
- from bson.tz_util import utc
- try:
- from bson import _cbson
- _USE_C = True
- except ImportError:
- _USE_C = False
- EPOCH_AWARE = datetime.datetime.fromtimestamp(0, utc)
- EPOCH_NAIVE = datetime.datetime.utcfromtimestamp(0)
- BSONNUM = b"\x01" # Floating point
- BSONSTR = b"\x02" # UTF-8 string
- BSONOBJ = b"\x03" # Embedded document
- BSONARR = b"\x04" # Array
- BSONBIN = b"\x05" # Binary
- BSONUND = b"\x06" # Undefined
- BSONOID = b"\x07" # ObjectId
- BSONBOO = b"\x08" # Boolean
- BSONDAT = b"\x09" # UTC Datetime
- BSONNUL = b"\x0A" # Null
- BSONRGX = b"\x0B" # Regex
- BSONREF = b"\x0C" # DBRef
- BSONCOD = b"\x0D" # Javascript code
- BSONSYM = b"\x0E" # Symbol
- BSONCWS = b"\x0F" # Javascript code with scope
- BSONINT = b"\x10" # 32bit int
- BSONTIM = b"\x11" # Timestamp
- BSONLON = b"\x12" # 64bit int
- BSONDEC = b"\x13" # Decimal128
- BSONMIN = b"\xFF" # Min key
- BSONMAX = b"\x7F" # Max key
- _UNPACK_FLOAT_FROM = struct.Struct("<d").unpack_from
- _UNPACK_INT = struct.Struct("<i").unpack
- _UNPACK_INT_FROM = struct.Struct("<i").unpack_from
- _UNPACK_LENGTH_SUBTYPE_FROM = struct.Struct("<iB").unpack_from
- _UNPACK_LONG_FROM = struct.Struct("<q").unpack_from
- _UNPACK_TIMESTAMP_FROM = struct.Struct("<II").unpack_from
- if PY3:
- _OBJEND = 0
- # Only used to generate the _ELEMENT_GETTER dict
- def _maybe_ord(element_type):
- return ord(element_type)
- # Only used in _raise_unknown_type below
- def _elt_to_hex(element_type):
- return chr(element_type).encode()
- _supported_buffer_types = (bytes, bytearray)
- else:
- _OBJEND = b"\x00"
- def _maybe_ord(element_type):
- return element_type
- def _elt_to_hex(element_type):
- return element_type
- _supported_buffer_types = (bytes,)
- if platform.python_implementation() == 'Jython':
- # This is why we can't have nice things.
- # https://bugs.jython.org/issue2788
- def get_data_and_view(data):
- if isinstance(data, _supported_buffer_types):
- return data, data
- data = memoryview(data).tobytes()
- return data, data
- else:
- def get_data_and_view(data):
- if isinstance(data, _supported_buffer_types):
- return data, memoryview(data)
- view = memoryview(data)
- return view.tobytes(), view
- def _raise_unknown_type(element_type, element_name):
- """Unknown type helper."""
- raise InvalidBSON("Detected unknown BSON type %r for fieldname '%s'. Are "
- "you using the latest driver version?" % (
- _elt_to_hex(element_type), element_name))
- def _get_int(data, view, position, dummy0, dummy1, dummy2):
- """Decode a BSON int32 to python int."""
- return _UNPACK_INT_FROM(data, position)[0], position + 4
- def _get_c_string(data, view, position, opts):
- """Decode a BSON 'C' string to python unicode string."""
- end = data.index(b"\x00", position)
- return _utf_8_decode(view[position:end],
- opts.unicode_decode_error_handler, True)[0], end + 1
- def _get_float(data, view, position, dummy0, dummy1, dummy2):
- """Decode a BSON double to python float."""
- return _UNPACK_FLOAT_FROM(data, position)[0], position + 8
- def _get_string(data, view, position, obj_end, opts, dummy):
- """Decode a BSON string to python unicode string."""
- length = _UNPACK_INT_FROM(data, position)[0]
- position += 4
- if length < 1 or obj_end - position < length:
- raise InvalidBSON("invalid string length")
- end = position + length - 1
- if data[end] != _OBJEND:
- raise InvalidBSON("invalid end of string")
- return _utf_8_decode(view[position:end],
- opts.unicode_decode_error_handler, True)[0], end + 1
- def _get_object_size(data, position, obj_end):
- """Validate and return a BSON document's size."""
- try:
- obj_size = _UNPACK_INT_FROM(data, position)[0]
- except struct.error as exc:
- raise InvalidBSON(str(exc))
- end = position + obj_size - 1
- if data[end] != _OBJEND:
- raise InvalidBSON("bad eoo")
- if end >= obj_end:
- raise InvalidBSON("invalid object length")
- # If this is the top-level document, validate the total size too.
- if position == 0 and obj_size != obj_end:
- raise InvalidBSON("invalid object length")
- return obj_size, end
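- # Illustrative only, not part of the module: the smallest valid BSON document
- # is 5 bytes, a little-endian int32 length that counts itself plus the
- # trailing NUL terminator, with no elements in between. The helper name below
- # is made up for this sketch.
- def _example_empty_document_size():
-     empty = b"\x05\x00\x00\x00\x00"
-     size, end = _get_object_size(empty, 0, len(empty))
-     assert (size, end) == (5, 4)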
- def _get_object(data, view, position, obj_end, opts, dummy):
- """Decode a BSON subdocument to opts.document_class or bson.dbref.DBRef."""
- obj_size, end = _get_object_size(data, position, obj_end)
- if _raw_document_class(opts.document_class):
- return (opts.document_class(data[position:end + 1], opts),
- position + obj_size)
- obj = _elements_to_dict(data, view, position + 4, end, opts)
- position += obj_size
- if "$ref" in obj:
- return (DBRef(obj.pop("$ref"), obj.pop("$id", None),
- obj.pop("$db", None), obj), position)
- return obj, position
- def _get_array(data, view, position, obj_end, opts, element_name):
- """Decode a BSON array to python list."""
- size = _UNPACK_INT_FROM(data, position)[0]
- end = position + size - 1
- if data[end] != _OBJEND:
- raise InvalidBSON("bad eoo")
- position += 4
- end -= 1
- result = []
- # Avoid doing global and attribute lookups in the loop.
- append = result.append
- index = data.index
- getter = _ELEMENT_GETTER
- decoder_map = opts.type_registry._decoder_map
- while position < end:
- element_type = data[position]
- # Just skip the keys.
- position = index(b'\x00', position) + 1
- try:
- value, position = getter[element_type](
- data, view, position, obj_end, opts, element_name)
- except KeyError:
- _raise_unknown_type(element_type, element_name)
- if decoder_map:
- custom_decoder = decoder_map.get(type(value))
- if custom_decoder is not None:
- value = custom_decoder(value)
- append(value)
- if position != end + 1:
- raise InvalidBSON('bad array length')
- return result, position + 1
- def _get_binary(data, view, position, obj_end, opts, dummy1):
- """Decode a BSON binary to bson.binary.Binary or python UUID."""
- length, subtype = _UNPACK_LENGTH_SUBTYPE_FROM(data, position)
- position += 5
- if subtype == 2:
- length2 = _UNPACK_INT_FROM(data, position)[0]
- position += 4
- if length2 != length - 4:
- raise InvalidBSON("invalid binary (st 2) - lengths don't match!")
- length = length2
- end = position + length
- if length < 0 or end > obj_end:
- raise InvalidBSON('bad binary object length')
- # Convert UUID subtypes to native UUIDs.
- # TODO: PYTHON-2245 Decoding should follow UUID spec in PyMongo 4.0+
- if subtype in ALL_UUID_SUBTYPES:
- uuid_representation = opts.uuid_representation
- binary_value = Binary(data[position:end], subtype)
- if uuid_representation == UuidRepresentation.UNSPECIFIED:
- return binary_value, end
- if subtype == UUID_SUBTYPE:
- # Legacy behavior: use STANDARD with binary subtype 4.
- uuid_representation = UuidRepresentation.STANDARD
- elif uuid_representation == UuidRepresentation.STANDARD:
- # subtype == OLD_UUID_SUBTYPE
- # Legacy behavior: STANDARD is the same as PYTHON_LEGACY.
- uuid_representation = UuidRepresentation.PYTHON_LEGACY
- return binary_value.as_uuid(uuid_representation), end
- # Python3 special case. Decode subtype 0 to 'bytes'.
- if PY3 and subtype == 0:
- value = data[position:end]
- else:
- value = Binary(data[position:end], subtype)
- return value, end
- def _get_oid(data, view, position, dummy0, dummy1, dummy2):
- """Decode a BSON ObjectId to bson.objectid.ObjectId."""
- end = position + 12
- return ObjectId(data[position:end]), end
- def _get_boolean(data, view, position, dummy0, dummy1, dummy2):
- """Decode a BSON true/false to python True/False."""
- end = position + 1
- boolean_byte = data[position:end]
- if boolean_byte == b'\x00':
- return False, end
- elif boolean_byte == b'\x01':
- return True, end
- raise InvalidBSON('invalid boolean value: %r' % boolean_byte)
- def _get_date(data, view, position, dummy0, opts, dummy1):
- """Decode a BSON datetime to python datetime.datetime."""
- return _millis_to_datetime(
- _UNPACK_LONG_FROM(data, position)[0], opts), position + 8
- def _get_code(data, view, position, obj_end, opts, element_name):
- """Decode a BSON code to bson.code.Code."""
- code, position = _get_string(data, view, position, obj_end, opts, element_name)
- return Code(code), position
- def _get_code_w_scope(data, view, position, obj_end, opts, element_name):
- """Decode a BSON code_w_scope to bson.code.Code."""
- code_end = position + _UNPACK_INT_FROM(data, position)[0]
- code, position = _get_string(
- data, view, position + 4, code_end, opts, element_name)
- scope, position = _get_object(data, view, position, code_end, opts, element_name)
- if position != code_end:
- raise InvalidBSON('scope outside of javascript code boundaries')
- return Code(code, scope), position
- def _get_regex(data, view, position, dummy0, opts, dummy1):
- """Decode a BSON regex to bson.regex.Regex or a python pattern object."""
- pattern, position = _get_c_string(data, view, position, opts)
- bson_flags, position = _get_c_string(data, view, position, opts)
- bson_re = Regex(pattern, bson_flags)
- return bson_re, position
- def _get_ref(data, view, position, obj_end, opts, element_name):
- """Decode (deprecated) BSON DBPointer to bson.dbref.DBRef."""
- collection, position = _get_string(
- data, view, position, obj_end, opts, element_name)
- oid, position = _get_oid(data, view, position, obj_end, opts, element_name)
- return DBRef(collection, oid), position
- def _get_timestamp(data, view, position, dummy0, dummy1, dummy2):
- """Decode a BSON timestamp to bson.timestamp.Timestamp."""
- inc, timestamp = _UNPACK_TIMESTAMP_FROM(data, position)
- return Timestamp(timestamp, inc), position + 8
- def _get_int64(data, view, position, dummy0, dummy1, dummy2):
- """Decode a BSON int64 to bson.int64.Int64."""
- return Int64(_UNPACK_LONG_FROM(data, position)[0]), position + 8
- def _get_decimal128(data, view, position, dummy0, dummy1, dummy2):
- """Decode a BSON decimal128 to bson.decimal128.Decimal128."""
- end = position + 16
- return Decimal128.from_bid(data[position:end]), end
- # Each decoder function's signature is:
- # - data: bytes
- # - view: memoryview that references `data`
- # - position: int, beginning of object in 'data' to decode
- # - obj_end: int, end of object to decode in 'data' if variable-length type
- # - opts: a CodecOptions
- # - element_name: unicode, name of the field being decoded (used mainly in
- #   error messages)
- _ELEMENT_GETTER = {
- _maybe_ord(BSONNUM): _get_float,
- _maybe_ord(BSONSTR): _get_string,
- _maybe_ord(BSONOBJ): _get_object,
- _maybe_ord(BSONARR): _get_array,
- _maybe_ord(BSONBIN): _get_binary,
- _maybe_ord(BSONUND): lambda u, v, w, x, y, z: (None, w), # Deprecated undefined
- _maybe_ord(BSONOID): _get_oid,
- _maybe_ord(BSONBOO): _get_boolean,
- _maybe_ord(BSONDAT): _get_date,
- _maybe_ord(BSONNUL): lambda u, v, w, x, y, z: (None, w),
- _maybe_ord(BSONRGX): _get_regex,
- _maybe_ord(BSONREF): _get_ref, # Deprecated DBPointer
- _maybe_ord(BSONCOD): _get_code,
- _maybe_ord(BSONSYM): _get_string, # Deprecated symbol
- _maybe_ord(BSONCWS): _get_code_w_scope,
- _maybe_ord(BSONINT): _get_int,
- _maybe_ord(BSONTIM): _get_timestamp,
- _maybe_ord(BSONLON): _get_int64,
- _maybe_ord(BSONDEC): _get_decimal128,
- _maybe_ord(BSONMIN): lambda u, v, w, x, y, z: (MinKey(), w),
- _maybe_ord(BSONMAX): lambda u, v, w, x, y, z: (MaxKey(), w)}
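- # Illustrative only, not part of the module: dispatching one element by hand
- # through the table above. encode({"n": 1}) produces
- # <int32 length> \x10 "n" \x00 <int32 1> \x00, so the int32 payload starts at
- # offset 7. The helper name below is made up for this sketch.
- def _example_element_getter_dispatch():
-     doc = encode({"n": 1})
-     element_type = _maybe_ord(doc[4:5])    # 0x10, i.e. BSONINT
-     value, position = _ELEMENT_GETTER[element_type](
-         doc, memoryview(doc), 7, len(doc), DEFAULT_CODEC_OPTIONS, u"n")
-     assert (value, position) == (1, 11)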
- if _USE_C:
- def _element_to_dict(data, view, position, obj_end, opts):
- return _cbson._element_to_dict(data, position, obj_end, opts)
- else:
- def _element_to_dict(data, view, position, obj_end, opts):
- """Decode a single key, value pair."""
- element_type = data[position]
- position += 1
- element_name, position = _get_c_string(data, view, position, opts)
- try:
- value, position = _ELEMENT_GETTER[element_type](data, view, position,
- obj_end, opts,
- element_name)
- except KeyError:
- _raise_unknown_type(element_type, element_name)
- if opts.type_registry._decoder_map:
- custom_decoder = opts.type_registry._decoder_map.get(type(value))
- if custom_decoder is not None:
- value = custom_decoder(value)
- return element_name, value, position
- def _raw_to_dict(data, position, obj_end, opts, result):
- data, view = get_data_and_view(data)
- return _elements_to_dict(data, view, position, obj_end, opts, result)
- def _elements_to_dict(data, view, position, obj_end, opts, result=None):
- """Decode a BSON document into result."""
- if result is None:
- result = opts.document_class()
- end = obj_end - 1
- while position < end:
- key, value, position = _element_to_dict(data, view, position, obj_end, opts)
- result[key] = value
- if position != obj_end:
- raise InvalidBSON('bad object or element length')
- return result
- def _bson_to_dict(data, opts):
- """Decode a BSON string to document_class."""
- data, view = get_data_and_view(data)
- try:
- if _raw_document_class(opts.document_class):
- return opts.document_class(data, opts)
- _, end = _get_object_size(data, 0, len(data))
- return _elements_to_dict(data, view, 4, end, opts)
- except InvalidBSON:
- raise
- except Exception:
- # Change exception type to InvalidBSON but preserve traceback.
- _, exc_value, exc_tb = sys.exc_info()
- reraise(InvalidBSON, exc_value, exc_tb)
- if _USE_C:
- _bson_to_dict = _cbson._bson_to_dict
- _PACK_FLOAT = struct.Struct("<d").pack
- _PACK_INT = struct.Struct("<i").pack
- _PACK_LENGTH_SUBTYPE = struct.Struct("<iB").pack
- _PACK_LONG = struct.Struct("<q").pack
- _PACK_TIMESTAMP = struct.Struct("<II").pack
- _LIST_NAMES = tuple(b(str(i)) + b"\x00" for i in range(1000))
- def gen_list_name():
- """Generate "keys" for encoded lists in the sequence
- b"0\x00", b"1\x00", b"2\x00", ...
- The first 1000 keys are returned from a pre-built cache. All
- subsequent keys are generated on the fly.
- """
- for name in _LIST_NAMES:
- yield name
- counter = itertools.count(1000)
- while True:
- yield b(str(next(counter))) + b"\x00"
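- # Illustrative only, not part of the module: the first few keys the generator
- # above yields, matching how _encode_list names array elements.
- def _example_gen_list_name():
-     names = gen_list_name()
-     assert [next(names) for _ in range(3)] == [b"0\x00", b"1\x00", b"2\x00"]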
- def _make_c_string_check(string):
- """Make a 'C' string, checking for embedded NUL characters."""
- if isinstance(string, bytes):
- if b"\x00" in string:
- raise InvalidDocument("BSON keys / regex patterns must not "
- "contain a NUL character")
- try:
- _utf_8_decode(string, None, True)
- return string + b"\x00"
- except UnicodeError:
- raise InvalidStringData("strings in documents must be valid "
- "UTF-8: %r" % string)
- else:
- if "\x00" in string:
- raise InvalidDocument("BSON keys / regex patterns must not "
- "contain a NUL character")
- return _utf_8_encode(string)[0] + b"\x00"
- def _make_c_string(string):
- """Make a 'C' string."""
- if isinstance(string, bytes):
- try:
- _utf_8_decode(string, None, True)
- return string + b"\x00"
- except UnicodeError:
- raise InvalidStringData("strings in documents must be valid "
- "UTF-8: %r" % string)
- else:
- return _utf_8_encode(string)[0] + b"\x00"
- if PY3:
- def _make_name(string):
- """Make a 'C' string suitable for a BSON key."""
- # Keys can only be text in python 3.
- if "\x00" in string:
- raise InvalidDocument("BSON keys / regex patterns must not "
- "contain a NUL character")
- return _utf_8_encode(string)[0] + b"\x00"
- else:
- # Keys can be unicode or bytes in python 2.
- _make_name = _make_c_string_check
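- # Illustrative only, not part of the module: _make_name rejects NUL bytes in
- # keys, while _make_c_string (used for JavaScript code values) skips that
- # check. The helper name below is made up for this sketch.
- def _example_make_name_nul_check():
-     assert _make_name(u"field") == b"field\x00"
-     try:
-         _make_name(u"bad\x00key")
-     except InvalidDocument:
-         pass    # NUL bytes in keys are rejected
-     else:
-         raise AssertionError("expected InvalidDocument")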
- def _encode_float(name, value, dummy0, dummy1):
- """Encode a float."""
- return b"\x01" + name + _PACK_FLOAT(value)
- if PY3:
- def _encode_bytes(name, value, dummy0, dummy1):
- """Encode a python bytes."""
- # Python3 special case. Store 'bytes' as BSON binary subtype 0.
- return b"\x05" + name + _PACK_INT(len(value)) + b"\x00" + value
- else:
- def _encode_bytes(name, value, dummy0, dummy1):
- """Encode a python str (python 2.x)."""
- try:
- _utf_8_decode(value, None, True)
- except UnicodeError:
- raise InvalidStringData("strings in documents must be valid "
- "UTF-8: %r" % (value,))
- return b"\x02" + name + _PACK_INT(len(value) + 1) + value + b"\x00"
- def _encode_mapping(name, value, check_keys, opts):
- """Encode a mapping type."""
- if _raw_document_class(value):
- return b'\x03' + name + value.raw
- data = b"".join([_element_to_bson(key, val, check_keys, opts)
- for key, val in iteritems(value)])
- return b"\x03" + name + _PACK_INT(len(data) + 5) + data + b"\x00"
- def _encode_dbref(name, value, check_keys, opts):
- """Encode bson.dbref.DBRef."""
- buf = bytearray(b"\x03" + name + b"\x00\x00\x00\x00")
- begin = len(buf) - 4
- buf += _name_value_to_bson(b"$ref\x00",
- value.collection, check_keys, opts)
- buf += _name_value_to_bson(b"$id\x00",
- value.id, check_keys, opts)
- if value.database is not None:
- buf += _name_value_to_bson(
- b"$db\x00", value.database, check_keys, opts)
- for key, val in iteritems(value._DBRef__kwargs):
- buf += _element_to_bson(key, val, check_keys, opts)
- buf += b"\x00"
- buf[begin:begin + 4] = _PACK_INT(len(buf) - begin)
- return bytes(buf)
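- # Illustrative only, not part of the module: a DBRef round-trips because
- # _encode_dbref writes an embedded document starting with $ref/$id (plus $db
- # when a database is set) and _get_object recognizes that shape when decoding.
- def _example_dbref_roundtrip():
-     doc = decode(encode({"link": DBRef("things", 1)}))
-     assert isinstance(doc["link"], DBRef)
-     assert doc["link"].collection == "things"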
- def _encode_list(name, value, check_keys, opts):
- """Encode a list/tuple."""
- lname = gen_list_name()
- data = b"".join([_name_value_to_bson(next(lname), item,
- check_keys, opts)
- for item in value])
- return b"\x04" + name + _PACK_INT(len(data) + 5) + data + b"\x00"
- def _encode_text(name, value, dummy0, dummy1):
- """Encode a python unicode (python 2.x) / str (python 3.x)."""
- value = _utf_8_encode(value)[0]
- return b"\x02" + name + _PACK_INT(len(value) + 1) + value + b"\x00"
- def _encode_binary(name, value, dummy0, dummy1):
- """Encode bson.binary.Binary."""
- subtype = value.subtype
- if subtype == 2:
- value = _PACK_INT(len(value)) + value
- return b"\x05" + name + _PACK_LENGTH_SUBTYPE(len(value), subtype) + value
- def _encode_uuid(name, value, dummy, opts):
- """Encode uuid.UUID."""
- uuid_representation = opts.uuid_representation
- binval = Binary.from_uuid(value, uuid_representation=uuid_representation)
- return _encode_binary(name, binval, dummy, opts)
- def _encode_objectid(name, value, dummy0, dummy1):
- """Encode bson.objectid.ObjectId."""
- return b"\x07" + name + value.binary
- def _encode_bool(name, value, dummy0, dummy1):
- """Encode a python boolean (True/False)."""
- return b"\x08" + name + (value and b"\x01" or b"\x00")
- def _encode_datetime(name, value, dummy0, dummy1):
- """Encode datetime.datetime."""
- millis = _datetime_to_millis(value)
- return b"\x09" + name + _PACK_LONG(millis)
- def _encode_none(name, dummy0, dummy1, dummy2):
- """Encode python None."""
- return b"\x0A" + name
- def _encode_regex(name, value, dummy0, dummy1):
- """Encode a python regex or bson.regex.Regex."""
- flags = value.flags
- # Python 2 common case
- if flags == 0:
- return b"\x0B" + name + _make_c_string_check(value.pattern) + b"\x00"
- # Python 3 common case
- elif flags == re.UNICODE:
- return b"\x0B" + name + _make_c_string_check(value.pattern) + b"u\x00"
- else:
- sflags = b""
- if flags & re.IGNORECASE:
- sflags += b"i"
- if flags & re.LOCALE:
- sflags += b"l"
- if flags & re.MULTILINE:
- sflags += b"m"
- if flags & re.DOTALL:
- sflags += b"s"
- if flags & re.UNICODE:
- sflags += b"u"
- if flags & re.VERBOSE:
- sflags += b"x"
- sflags += b"\x00"
- return b"\x0B" + name + _make_c_string_check(value.pattern) + sflags
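- # Illustrative only, not part of the module: a case-insensitive, multiline
- # bytes pattern is stored with the BSON flag string "im" (a bytes pattern is
- # used here so Python 3 does not add an implicit re.UNICODE flag).
- def _example_regex_flags():
-     pattern = re.compile(b"^a", re.IGNORECASE | re.MULTILINE)
-     assert _encode_regex(b"p\x00", pattern, False, None) == b"\x0Bp\x00^a\x00im\x00"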
- def _encode_code(name, value, dummy, opts):
- """Encode bson.code.Code."""
- cstring = _make_c_string(value)
- cstrlen = len(cstring)
- if value.scope is None:
- return b"\x0D" + name + _PACK_INT(cstrlen) + cstring
- scope = _dict_to_bson(value.scope, False, opts, False)
- full_length = _PACK_INT(8 + cstrlen + len(scope))
- return b"\x0F" + name + full_length + _PACK_INT(cstrlen) + cstring + scope
- def _encode_int(name, value, dummy0, dummy1):
- """Encode a python int."""
- if -2147483648 <= value <= 2147483647:
- return b"\x10" + name + _PACK_INT(value)
- else:
- try:
- return b"\x12" + name + _PACK_LONG(value)
- except struct.error:
- raise OverflowError("BSON can only handle up to 8-byte ints")
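- # Illustrative only, not part of the module: values inside the signed 32-bit
- # range use the \x10 int32 marker, anything larger is written as a \x12 int64.
- def _example_int_width():
-     assert _encode_int(b"n\x00", 2147483647, False, None)[:1] == b"\x10"
-     assert _encode_int(b"n\x00", 2147483648, False, None)[:1] == b"\x12"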
- def _encode_timestamp(name, value, dummy0, dummy1):
- """Encode bson.timestamp.Timestamp."""
- return b"\x11" + name + _PACK_TIMESTAMP(value.inc, value.time)
- def _encode_long(name, value, dummy0, dummy1):
- """Encode a python long (python 2.x)"""
- try:
- return b"\x12" + name + _PACK_LONG(value)
- except struct.error:
- raise OverflowError("BSON can only handle up to 8-byte ints")
- def _encode_decimal128(name, value, dummy0, dummy1):
- """Encode bson.decimal128.Decimal128."""
- return b"\x13" + name + value.bid
- def _encode_minkey(name, dummy0, dummy1, dummy2):
- """Encode bson.min_key.MinKey."""
- return b"\xFF" + name
- def _encode_maxkey(name, dummy0, dummy1, dummy2):
- """Encode bson.max_key.MaxKey."""
- return b"\x7F" + name
- # Each encoder function's signature is:
- # - name: utf-8 bytes
- # - value: a Python data type, e.g. a Python int for _encode_int
- # - check_keys: bool, whether to check for invalid names
- # - opts: a CodecOptions
- _ENCODERS = {
- bool: _encode_bool,
- bytes: _encode_bytes,
- datetime.datetime: _encode_datetime,
- dict: _encode_mapping,
- float: _encode_float,
- int: _encode_int,
- list: _encode_list,
- # unicode in py2, str in py3
- text_type: _encode_text,
- tuple: _encode_list,
- type(None): _encode_none,
- uuid.UUID: _encode_uuid,
- Binary: _encode_binary,
- Int64: _encode_long,
- Code: _encode_code,
- DBRef: _encode_dbref,
- MaxKey: _encode_maxkey,
- MinKey: _encode_minkey,
- ObjectId: _encode_objectid,
- Regex: _encode_regex,
- RE_TYPE: _encode_regex,
- SON: _encode_mapping,
- Timestamp: _encode_timestamp,
- UUIDLegacy: _encode_binary,
- Decimal128: _encode_decimal128,
- # Special case. This will never be looked up directly.
- abc.Mapping: _encode_mapping,
- }
- _MARKERS = {
- 5: _encode_binary,
- 7: _encode_objectid,
- 11: _encode_regex,
- 13: _encode_code,
- 17: _encode_timestamp,
- 18: _encode_long,
- 100: _encode_dbref,
- 127: _encode_maxkey,
- 255: _encode_minkey,
- }
- if not PY3:
- _ENCODERS[long] = _encode_long
- _BUILT_IN_TYPES = tuple(t for t in _ENCODERS)
- def _name_value_to_bson(name, value, check_keys, opts,
- in_custom_call=False,
- in_fallback_call=False):
- """Encode a single name, value pair."""
- # First see if the type is already cached. KeyError will only ever
- # happen once per subtype.
- try:
- return _ENCODERS[type(value)](name, value, check_keys, opts)
- except KeyError:
- pass
- # Second, fall back to trying _type_marker. This has to be done
- # before the loop below since users could subclass one of our
- # custom types that subclasses a python built-in (e.g. Binary)
- marker = getattr(value, "_type_marker", None)
- if isinstance(marker, int) and marker in _MARKERS:
- func = _MARKERS[marker]
- # Cache this type for faster subsequent lookup.
- _ENCODERS[type(value)] = func
- return func(name, value, check_keys, opts)
- # Third, check if a type encoder is registered for this type.
- # Note that subtypes of registered custom types are not auto-encoded.
- if not in_custom_call and opts.type_registry._encoder_map:
- custom_encoder = opts.type_registry._encoder_map.get(type(value))
- if custom_encoder is not None:
- return _name_value_to_bson(
- name, custom_encoder(value), check_keys, opts,
- in_custom_call=True)
- # Fourth, test each base type. This will only happen once for
- # a subtype of a supported base type. Unlike in the C-extensions, this
- # is done after trying the custom type encoder because checking for each
- # subtype is expensive.
- for base in _BUILT_IN_TYPES:
- if isinstance(value, base):
- func = _ENCODERS[base]
- # Cache this type for faster subsequent lookup.
- _ENCODERS[type(value)] = func
- return func(name, value, check_keys, opts)
- # As a last resort, try using the fallback encoder, if the user has
- # provided one.
- fallback_encoder = opts.type_registry._fallback_encoder
- if not in_fallback_call and fallback_encoder is not None:
- return _name_value_to_bson(
- name, fallback_encoder(value), check_keys, opts,
- in_fallback_call=True)
- raise InvalidDocument(
- "cannot encode object: %r, of type: %r" % (value, type(value)))
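- # Illustrative only, not part of the module: a fallback encoder supplied via a
- # TypeRegistry is the last step in the lookup chain above. Converting Decimal
- # to Decimal128 here is a sketch, not behavior this module provides by default.
- def _example_fallback_encoder():
-     from decimal import Decimal
-     from bson.codec_options import TypeRegistry
-     opts = CodecOptions(type_registry=TypeRegistry(fallback_encoder=Decimal128))
-     data = _dict_to_bson({"price": Decimal("9.99")}, False, opts)
-     assert isinstance(decode(data, opts)["price"], Decimal128)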
- def _element_to_bson(key, value, check_keys, opts):
- """Encode a single key, value pair."""
- if not isinstance(key, string_type):
- raise InvalidDocument("documents must have only string keys, "
- "key was %r" % (key,))
- if check_keys:
- if key.startswith("$"):
- raise InvalidDocument("key %r must not start with '$'" % (key,))
- if "." in key:
- raise InvalidDocument("key %r must not contain '.'" % (key,))
- name = _make_name(key)
- return _name_value_to_bson(name, value, check_keys, opts)
- def _dict_to_bson(doc, check_keys, opts, top_level=True):
- """Encode a document to BSON."""
- if _raw_document_class(doc):
- return doc.raw
- try:
- elements = []
- if top_level and "_id" in doc:
- elements.append(_name_value_to_bson(b"_id\x00", doc["_id"],
- check_keys, opts))
- for (key, value) in iteritems(doc):
- if not top_level or key != "_id":
- elements.append(_element_to_bson(key, value,
- check_keys, opts))
- except AttributeError:
- raise TypeError("encoder expected a mapping type but got: %r" % (doc,))
- encoded = b"".join(elements)
- return _PACK_INT(len(encoded) + 5) + encoded + b"\x00"
- if _USE_C:
- _dict_to_bson = _cbson._dict_to_bson
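- # Illustrative only, not part of the module: a top-level "_id" is hoisted to
- # the front of the encoded document; decoding into an ordered document_class
- # makes the reordering visible.
- def _example_id_comes_first():
-     data = _dict_to_bson(SON([("a", 1), ("_id", 2)]), False, DEFAULT_CODEC_OPTIONS)
-     assert list(decode(data, CodecOptions(document_class=SON))) == ["_id", "a"]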
- def _millis_to_datetime(millis, opts):
- """Convert milliseconds since epoch UTC to datetime."""
- diff = ((millis % 1000) + 1000) % 1000
- seconds = (millis - diff) // 1000
- micros = diff * 1000
- if opts.tz_aware:
- dt = EPOCH_AWARE + datetime.timedelta(seconds=seconds,
- microseconds=micros)
- if opts.tzinfo:
- dt = dt.astimezone(opts.tzinfo)
- return dt
- else:
- return EPOCH_NAIVE + datetime.timedelta(seconds=seconds,
- microseconds=micros)
- def _datetime_to_millis(dtm):
- """Convert datetime to milliseconds since epoch UTC."""
- if dtm.utcoffset() is not None:
- dtm = dtm - dtm.utcoffset()
- return int(calendar.timegm(dtm.timetuple()) * 1000 +
- dtm.microsecond // 1000)
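- # Illustrative only, not part of the module: round-tripping through the
- # millisecond representation drops sub-millisecond precision, as noted in the
- # module docstring.
- def _example_millis_roundtrip():
-     dt = datetime.datetime(2020, 1, 1, 12, 30, 15, 123999)
-     millis = _datetime_to_millis(dt)
-     assert _millis_to_datetime(millis, DEFAULT_CODEC_OPTIONS).microsecond == 123000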
- _CODEC_OPTIONS_TYPE_ERROR = TypeError(
- "codec_options must be an instance of CodecOptions")
- def encode(document, check_keys=False, codec_options=DEFAULT_CODEC_OPTIONS):
- """Encode a document to BSON.
- A document can be any mapping type (like :class:`dict`).
- Raises :class:`TypeError` if `document` is not a mapping type,
- or contains keys that are not instances of
- :class:`basestring` (:class:`str` in python 3). Raises
- :class:`~bson.errors.InvalidDocument` if `document` cannot be
- converted to :class:`BSON`.
- :Parameters:
- - `document`: mapping type representing a document
- - `check_keys` (optional): check if keys start with '$' or
- contain '.', raising :class:`~bson.errors.InvalidDocument` in
- either case
- - `codec_options` (optional): An instance of
- :class:`~bson.codec_options.CodecOptions`.
- .. versionadded:: 3.9
- """
- if not isinstance(codec_options, CodecOptions):
- raise _CODEC_OPTIONS_TYPE_ERROR
- return _dict_to_bson(document, check_keys, codec_options)
- def decode(data, codec_options=DEFAULT_CODEC_OPTIONS):
- """Decode BSON to a document.
- By default, returns a BSON document represented as a Python
- :class:`dict`. To use a different :class:`MutableMapping` class,
- configure a :class:`~bson.codec_options.CodecOptions`::
- >>> import collections # From Python standard library.
- >>> import bson
- >>> from bson.codec_options import CodecOptions
- >>> data = bson.encode({'a': 1})
- >>> decoded_doc = bson.decode(data)
- >>> type(decoded_doc)
- <type 'dict'>
- >>> options = CodecOptions(document_class=collections.OrderedDict)
- >>> decoded_doc = bson.decode(data, codec_options=options)
- >>> type(decoded_doc)
- <class 'collections.OrderedDict'>
- :Parameters:
- - `data`: the BSON to decode. Any bytes-like object that implements
- the buffer protocol.
- - `codec_options` (optional): An instance of
- :class:`~bson.codec_options.CodecOptions`.
- .. versionadded:: 3.9
- """
- if not isinstance(codec_options, CodecOptions):
- raise _CODEC_OPTIONS_TYPE_ERROR
- return _bson_to_dict(data, codec_options)
- def decode_all(data, codec_options=DEFAULT_CODEC_OPTIONS):
- """Decode BSON data to multiple documents.
- `data` must be a bytes-like object implementing the buffer protocol that
- provides concatenated, valid, BSON-encoded documents.
- :Parameters:
- - `data`: BSON data
- - `codec_options` (optional): An instance of
- :class:`~bson.codec_options.CodecOptions`.
- .. versionchanged:: 3.9
- Supports bytes-like objects that implement the buffer protocol.
- .. versionchanged:: 3.0
- Removed `compile_re` option: PyMongo now always represents BSON regular
- expressions as :class:`~bson.regex.Regex` objects. Use
- :meth:`~bson.regex.Regex.try_compile` to attempt to convert from a
- BSON regular expression to a Python regular expression object.
- Replaced `as_class`, `tz_aware`, and `uuid_subtype` options with
- `codec_options`.
- .. versionchanged:: 2.7
- Added `compile_re` option. If set to False, PyMongo represented BSON
- regular expressions as :class:`~bson.regex.Regex` objects instead of
- attempting to compile BSON regular expressions as Python native
- regular expressions, thus preventing errors for some incompatible
- patterns, see `PYTHON-500`_.
- .. _PYTHON-500: https://jira.mongodb.org/browse/PYTHON-500
- """
- data, view = get_data_and_view(data)
- if not isinstance(codec_options, CodecOptions):
- raise _CODEC_OPTIONS_TYPE_ERROR
- data_len = len(data)
- docs = []
- position = 0
- end = data_len - 1
- use_raw = _raw_document_class(codec_options.document_class)
- try:
- while position < end:
- obj_size = _UNPACK_INT_FROM(data, position)[0]
- if data_len - position < obj_size:
- raise InvalidBSON("invalid object size")
- obj_end = position + obj_size - 1
- if data[obj_end] != _OBJEND:
- raise InvalidBSON("bad eoo")
- if use_raw:
- docs.append(
- codec_options.document_class(
- data[position:obj_end + 1], codec_options))
- else:
- docs.append(_elements_to_dict(data,
- view,
- position + 4,
- obj_end,
- codec_options))
- position += obj_size
- return docs
- except InvalidBSON:
- raise
- except Exception:
- # Change exception type to InvalidBSON but preserve traceback.
- _, exc_value, exc_tb = sys.exc_info()
- reraise(InvalidBSON, exc_value, exc_tb)
- if _USE_C:
- decode_all = _cbson.decode_all
- def _decode_selective(rawdoc, fields, codec_options):
- if _raw_document_class(codec_options.document_class):
- # If document_class is RawBSONDocument, use vanilla dictionary for
- # decoding command response.
- doc = {}
- else:
- # Else, use the specified document_class.
- doc = codec_options.document_class()
- for key, value in iteritems(rawdoc):
- if key in fields:
- if fields[key] == 1:
- doc[key] = _bson_to_dict(rawdoc.raw, codec_options)[key]
- else:
- doc[key] = _decode_selective(value, fields[key], codec_options)
- else:
- doc[key] = value
- return doc
- def _convert_raw_document_lists_to_streams(document):
- cursor = document.get('cursor')
- if cursor:
- for key in ('firstBatch', 'nextBatch'):
- batch = cursor.get(key)
- if batch:
- stream = b"".join(doc.raw for doc in batch)
- cursor[key] = [stream]
- def _decode_all_selective(data, codec_options, fields):
- """Decode BSON data to a single document while using user-provided
- custom decoding logic.
- `data` must be a string representing a valid, BSON-encoded document.
- :Parameters:
- - `data`: BSON data
- - `codec_options`: An instance of
- :class:`~bson.codec_options.CodecOptions` with user-specified type
- decoders. If no decoders are found, this method is the same as
- ``decode_all``.
- - `fields`: Map of document namespaces where data that needs
- to be custom decoded lives or None. For example, to custom decode a
- list of objects in 'field1.subfield1', the specified value should be
- ``{'field1': {'subfield1': 1}}``. If ``fields`` is an empty map or
- None, this method is the same as ``decode_all``.
- :Returns:
- - `document_list`: Single-member list containing the decoded document.
- .. versionadded:: 3.8
- """
- if not codec_options.type_registry._decoder_map:
- return decode_all(data, codec_options)
- if not fields:
- return decode_all(data, codec_options.with_options(type_registry=None))
- # Decode documents for internal use.
- from bson.raw_bson import RawBSONDocument
- internal_codec_options = codec_options.with_options(
- document_class=RawBSONDocument, type_registry=None)
- _doc = _bson_to_dict(data, internal_codec_options)
- return [_decode_selective(_doc, fields, codec_options)]
- def decode_iter(data, codec_options=DEFAULT_CODEC_OPTIONS):
- """Decode BSON data to multiple documents as a generator.
- Works similarly to the decode_all function, but yields one document at a
- time.
- `data` must be a string of concatenated, valid, BSON-encoded
- documents.
- :Parameters:
- - `data`: BSON data
- - `codec_options` (optional): An instance of
- :class:`~bson.codec_options.CodecOptions`.
- .. versionchanged:: 3.0
- Replaced `as_class`, `tz_aware`, and `uuid_subtype` options with
- `codec_options`.
- .. versionadded:: 2.8
- """
- if not isinstance(codec_options, CodecOptions):
- raise _CODEC_OPTIONS_TYPE_ERROR
- position = 0
- end = len(data) - 1
- while position < end:
- obj_size = _UNPACK_INT_FROM(data, position)[0]
- elements = data[position:position + obj_size]
- position += obj_size
- yield _bson_to_dict(elements, codec_options)
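- # Illustrative only, not part of the module: decode_iter() walks a buffer of
- # concatenated BSON documents and yields them one at a time.
- def _example_decode_iter():
-     data = encode({"x": 1}) + encode({"x": 2})
-     assert [doc["x"] for doc in decode_iter(data)] == [1, 2]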
- def decode_file_iter(file_obj, codec_options=DEFAULT_CODEC_OPTIONS):
- """Decode bson data from a file to multiple documents as a generator.
- Works similarly to the decode_all function, but reads from the file object
- in chunks and parses bson in chunks, yielding one document at a time.
- :Parameters:
- - `file_obj`: A file object containing BSON data.
- - `codec_options` (optional): An instance of
- :class:`~bson.codec_options.CodecOptions`.
- .. versionchanged:: 3.0
- Replaced `as_class`, `tz_aware`, and `uuid_subtype` options with
- `codec_options`.
- .. versionadded:: 2.8
- """
- while True:
- # Read size of next object.
- size_data = file_obj.read(4)
- if not size_data:
- break # Finished with file normally.
- elif len(size_data) != 4:
- raise InvalidBSON("cut off in middle of objsize")
- obj_size = _UNPACK_INT_FROM(size_data, 0)[0] - 4
- elements = size_data + file_obj.read(max(0, obj_size))
- yield _bson_to_dict(elements, codec_options)
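- # Illustrative only, not part of the module: decode_file_iter() applies the
- # same framing to a file-like object, reading each 4-byte size prefix and then
- # the rest of that document.
- def _example_decode_file_iter():
-     import io
-     stream = io.BytesIO(encode({"x": 1}) + encode({"x": 2}))
-     assert [doc["x"] for doc in decode_file_iter(stream)] == [1, 2]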
- def is_valid(bson):
- """Check that the given string represents valid :class:`BSON` data.
- Raises :class:`TypeError` if `bson` is not an instance of
- :class:`str` (:class:`bytes` in python 3). Returns ``True``
- if `bson` is valid :class:`BSON`, ``False`` otherwise.
- :Parameters:
- - `bson`: the data to be validated
- """
- if not isinstance(bson, bytes):
- raise TypeError("BSON data must be an instance of a subclass of bytes")
- try:
- _bson_to_dict(bson, DEFAULT_CODEC_OPTIONS)
- return True
- except Exception:
- return False
- class BSON(bytes):
- """BSON (Binary JSON) data.
- .. warning:: Using this class to encode and decode BSON adds a performance
- cost. For better performance use the module level functions
- :func:`encode` and :func:`decode` instead.
- """
- @classmethod
- def encode(cls, document, check_keys=False,
- codec_options=DEFAULT_CODEC_OPTIONS):
- """Encode a document to a new :class:`BSON` instance.
- A document can be any mapping type (like :class:`dict`).
- Raises :class:`TypeError` if `document` is not a mapping type,
- or contains keys that are not instances of
- :class:`basestring` (:class:`str` in python 3). Raises
- :class:`~bson.errors.InvalidDocument` if `document` cannot be
- converted to :class:`BSON`.
- :Parameters:
- - `document`: mapping type representing a document
- - `check_keys` (optional): check if keys start with '$' or
- contain '.', raising :class:`~bson.errors.InvalidDocument` in
- either case
- - `codec_options` (optional): An instance of
- :class:`~bson.codec_options.CodecOptions`.
- .. versionchanged:: 3.0
- Replaced `uuid_subtype` option with `codec_options`.
- """
- return cls(encode(document, check_keys, codec_options))
- def decode(self, codec_options=DEFAULT_CODEC_OPTIONS):
- """Decode this BSON data.
- By default, returns a BSON document represented as a Python
- :class:`dict`. To use a different :class:`MutableMapping` class,
- configure a :class:`~bson.codec_options.CodecOptions`::
- >>> import collections # From Python standard library.
- >>> import bson
- >>> from bson.codec_options import CodecOptions
- >>> data = bson.BSON.encode({'a': 1})
- >>> decoded_doc = bson.BSON(data).decode()
- >>> type(decoded_doc)
- <type 'dict'>
- >>> options = CodecOptions(document_class=collections.OrderedDict)
- >>> decoded_doc = bson.BSON(data).decode(codec_options=options)
- >>> type(decoded_doc)
- <class 'collections.OrderedDict'>
- :Parameters:
- - `codec_options` (optional): An instance of
- :class:`~bson.codec_options.CodecOptions`.
- .. versionchanged:: 3.0
- Removed `compile_re` option: PyMongo now always represents BSON
- regular expressions as :class:`~bson.regex.Regex` objects. Use
- :meth:`~bson.regex.Regex.try_compile` to attempt to convert from a
- BSON regular expression to a Python regular expression object.
- Replaced `as_class`, `tz_aware`, and `uuid_subtype` options with
- `codec_options`.
- .. versionchanged:: 2.7
- Added `compile_re` option. If set to False, PyMongo represented BSON
- regular expressions as :class:`~bson.regex.Regex` objects instead of
- attempting to compile BSON regular expressions as Python native
- regular expressions, thus preventing errors for some incompatible
- patterns, see `PYTHON-500`_.
- .. _PYTHON-500: https://jira.mongodb.org/browse/PYTHON-500
- """
- return decode(self, codec_options)
- def has_c():
- """Is the C extension installed?
- """
- return _USE_C