123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350 |
- """
- Item Exporters are used to export/serialize items into different formats.
- """
- import csv
- import io
- import sys
- import pprint
- import marshal
- import six
- from six.moves import cPickle as pickle
- from xml.sax.saxutils import XMLGenerator
- from scrapy.utils.serialize import ScrapyJSONEncoder
- from scrapy.utils.python import to_bytes, to_unicode, to_native_str, is_listlike
- from scrapy.item import BaseItem
- from scrapy.exceptions import ScrapyDeprecationWarning
- import warnings
# Names exported by ``from <this module> import *``.
# NOTE(review): PythonItemExporter is defined in this module but is not
# listed here -- confirm whether that omission is intentional.
__all__ = [
    'BaseItemExporter',
    'PprintItemExporter',
    'PickleItemExporter',
    'CsvItemExporter',
    'XmlItemExporter',
    'JsonLinesItemExporter',
    'JsonItemExporter',
    'MarshalItemExporter',
]
class BaseItemExporter(object):
    """Base class for item exporters.

    Handles the options shared by the exporters (``encoding``,
    ``fields_to_export``, ``export_empty_fields`` and ``indent``) and the
    selection and serialization of item fields. :meth:`export_item` must be
    provided by subclasses.
    """

    def __init__(self, **kwargs):
        self._configure(kwargs)

    def _configure(self, options, dont_fail=False):
        """Configure the exporter by popping options from the ``options`` dict.

        The recognised keys are removed from ``options`` and stored as
        attributes of the same name. If ``dont_fail`` is set, leftover keys
        do not raise an exception (useful when subclass constructors forward
        the remaining keyword arguments elsewhere).

        :raises TypeError: if unknown options remain and ``dont_fail`` is
            false.
        """
        known_options = (
            ('encoding', None),
            ('fields_to_export', None),
            ('export_empty_fields', False),
            ('indent', None),
        )
        for option_name, fallback in known_options:
            setattr(self, option_name, options.pop(option_name, fallback))
        if options and not dont_fail:
            raise TypeError("Unexpected options: %s" % ', '.join(options.keys()))

    def export_item(self, item):
        """Export a single item; subclasses must override this."""
        raise NotImplementedError

    def serialize_field(self, field, name, value):
        """Return ``value`` passed through the field's ``serializer``.

        When ``field`` declares no ``serializer`` the value is returned
        unchanged. ``name`` is not used here.
        """
        def _identity(x):
            return x

        return field.get('serializer', _identity)(value)

    def start_exporting(self):
        """Hook called before the first item is exported; no-op here."""
        pass

    def finish_exporting(self):
        """Hook called after the last item is exported; no-op here."""
        pass

    def _get_serialized_fields(self, item, default_value=None, include_empty=None):
        """Return the fields to export as an iterable of tuples
        (name, serialized_value)

        ``include_empty`` defaults to ``self.export_empty_fields``. Fields
        selected for export but absent from ``item`` are yielded with
        ``default_value``.
        """
        if include_empty is None:
            include_empty = self.export_empty_fields

        item_is_dict = isinstance(item, dict)
        selected = self.fields_to_export
        if selected is not None:
            # Explicit selection: optionally drop names the item lacks.
            if include_empty:
                names = selected
            else:
                names = (x for x in selected if x in item)
        elif include_empty and not item_is_dict:
            # Declared fields of the item, populated or not.
            names = six.iterkeys(item.fields)
        else:
            names = six.iterkeys(item)

        for field_name in names:
            if field_name not in item:
                yield field_name, default_value
                continue
            # Plain dicts carry no field metadata.
            field = {} if item_is_dict else item.fields[field_name]
            yield field_name, self.serialize_field(field, field_name, item[field_name])
class JsonLinesItemExporter(BaseItemExporter):
    """Exports items as JSON, one encoded object per line.

    :param file: file-like object whose ``write`` method receives bytes.

    Keyword arguments not consumed by the base exporter options are passed
    to ``ScrapyJSONEncoder``.
    """

    def __init__(self, file, **kwargs):
        self._configure(kwargs, dont_fail=True)
        self.file = file
        # Unless the caller chose otherwise, escape non-ASCII characters
        # only when no explicit encoding was configured.
        if 'ensure_ascii' not in kwargs:
            kwargs['ensure_ascii'] = not self.encoding
        self.encoder = ScrapyJSONEncoder(**kwargs)

    def export_item(self, item):
        """Write ``item`` as one JSON document followed by a newline."""
        serialized = dict(self._get_serialized_fields(item))
        line = self.encoder.encode(serialized) + '\n'
        self.file.write(to_bytes(line, self.encoding))
class JsonItemExporter(BaseItemExporter):
    """Exports all items as a single JSON array.

    :param file: file-like object whose ``write`` method receives bytes.

    Keyword arguments not consumed by the base exporter options are passed
    to ``ScrapyJSONEncoder``.
    """

    def __init__(self, file, **kwargs):
        self._configure(kwargs, dont_fail=True)
        self.file = file
        # There is a small difference between the behaviour of
        # JsonItemExporter.indent and ScrapyJSONEncoder.indent:
        # ScrapyJSONEncoder.indent=None is needed to prevent the addition
        # of newlines everywhere.
        if self.indent is not None and self.indent > 0:
            json_indent = self.indent
        else:
            json_indent = None
        kwargs.setdefault('indent', json_indent)
        kwargs.setdefault('ensure_ascii', not self.encoding)
        self.encoder = ScrapyJSONEncoder(**kwargs)
        # Tracks whether a separating comma is needed before the next item.
        self.first_item = True

    def _beautify_newline(self):
        """Write a newline, but only when an ``indent`` was configured."""
        if self.indent is None:
            return
        self.file.write(b'\n')

    def start_exporting(self):
        """Open the JSON array."""
        self.file.write(b"[")
        self._beautify_newline()

    def finish_exporting(self):
        """Close the JSON array."""
        self._beautify_newline()
        self.file.write(b"]")

    def export_item(self, item):
        """Write ``item`` as the next array element, comma-separated."""
        if not self.first_item:
            self.file.write(b',')
            self._beautify_newline()
        else:
            self.first_item = False
        serialized = dict(self._get_serialized_fields(item))
        encoded = self.encoder.encode(serialized)
        self.file.write(to_bytes(encoded, self.encoding))
class XmlItemExporter(BaseItemExporter):
    """Exports items as an XML document through ``XMLGenerator``.

    :param file: file-like object handed to ``XMLGenerator``.
    :param item_element: (keyword) tag wrapping each item, default ``item``.
    :param root_element: (keyword) tag of the document root, default
        ``items``.

    Any other keyword argument must be a base exporter option.
    """

    def __init__(self, file, **kwargs):
        self.item_element = kwargs.pop('item_element', 'item')
        self.root_element = kwargs.pop('root_element', 'items')
        self._configure(kwargs)
        # Default to UTF-8 when no encoding was configured.
        self.encoding = self.encoding or 'utf-8'
        self.xg = XMLGenerator(file, encoding=self.encoding)

    def _beautify_newline(self, new_item=False):
        """Emit a newline when indentation is configured.

        With ``indent`` set to a non-positive number, a newline is only
        emitted for ``new_item`` calls.
        """
        if self.indent is None:
            return
        if self.indent > 0 or new_item:
            self._xg_characters('\n')

    def _beautify_indent(self, depth=1):
        """Emit ``indent * depth`` spaces when ``indent`` is truthy."""
        if not self.indent:
            return
        self._xg_characters(' ' * self.indent * depth)

    def start_exporting(self):
        """Start the document and open the root element."""
        self.xg.startDocument()
        self.xg.startElement(self.root_element, {})
        self._beautify_newline(new_item=True)

    def export_item(self, item):
        """Write ``item`` as one item element with a child per field."""
        self._beautify_indent(depth=1)
        self.xg.startElement(self.item_element, {})
        self._beautify_newline()
        serialized = self._get_serialized_fields(item, default_value='')
        for field_name, field_value in serialized:
            self._export_xml_field(field_name, field_value, depth=2)
        self._beautify_indent(depth=1)
        self.xg.endElement(self.item_element)
        self._beautify_newline(new_item=True)

    def finish_exporting(self):
        """Close the root element and end the document."""
        self.xg.endElement(self.root_element)
        self.xg.endDocument()

    def _export_xml_field(self, name, serialized_value, depth):
        """Write one ``<name>`` element, recursing into nested containers.

        Objects with an ``items`` attribute become one child element per
        key; list-like values become a sequence of ``<value>`` children;
        anything else is written as character data.
        """
        self._beautify_indent(depth=depth)
        self.xg.startElement(name, {})
        if hasattr(serialized_value, 'items'):
            self._beautify_newline()
            for child_name, child_value in serialized_value.items():
                self._export_xml_field(child_name, child_value, depth=depth+1)
            self._beautify_indent(depth=depth)
        elif is_listlike(serialized_value):
            self._beautify_newline()
            for element in serialized_value:
                self._export_xml_field('value', element, depth=depth+1)
            self._beautify_indent(depth=depth)
        else:
            text = serialized_value
            if not isinstance(text, six.text_type):
                text = str(text)
            self._xg_characters(text)
        self.xg.endElement(name)
        self._beautify_newline()

    # Workaround for https://bugs.python.org/issue17606
    # Before Python 2.7.4 xml.sax.saxutils required bytes;
    # since 2.7.4 it requires unicode. The bug is likely to be
    # fixed in 2.7.6, but 2.7.6 will still support unicode,
    # and Python 3.x will require unicode, so ">= 2.7.4" should be fine.
    if sys.version_info[:3] >= (2, 7, 4):
        def _xg_characters(self, serialized_value):
            """Write character data, decoding non-text values first."""
            if not isinstance(serialized_value, six.text_type):
                serialized_value = serialized_value.decode(self.encoding)
            return self.xg.characters(serialized_value)
    else:  # pragma: no cover
        def _xg_characters(self, serialized_value):
            """Write character data as-is."""
            return self.xg.characters(serialized_value)
class CsvItemExporter(BaseItemExporter):
    """Exports items as CSV rows.

    :param file: file-like object to write to. On Python 3 it is wrapped in
        an :class:`io.TextIOWrapper` using the configured encoding; on
        Python 2 it is handed to :func:`csv.writer` directly.
    :param include_headers_line: when true, a header row with the exported
        field names is written before the first item.
    :param join_multivalued: string used to join ``list``/``tuple`` field
        values into a single cell.

    Keyword arguments not consumed by the base exporter options are passed
    to :func:`csv.writer`. The encoding defaults to ``utf-8``.
    """

    def __init__(self, file, include_headers_line=True, join_multivalued=',', **kwargs):
        self._configure(kwargs, dont_fail=True)
        if not self.encoding:
            self.encoding = 'utf-8'
        self.include_headers_line = include_headers_line
        self.stream = io.TextIOWrapper(
            file,
            line_buffering=False,
            write_through=True,
            encoding=self.encoding,
            newline=''  # Windows needs this https://github.com/scrapy/scrapy/issues/3034
        ) if six.PY3 else file
        self.csv_writer = csv.writer(self.stream, **kwargs)
        self._headers_not_written = True
        self._join_multivalued = join_multivalued

    def finish_exporting(self):
        """Release the text wrapper without closing the caller's file.

        On Python 3 ``self.stream`` is an :class:`io.TextIOWrapper` around
        the file passed to the constructor. If it is left attached, the
        wrapper closes that file when the exporter is garbage-collected,
        even though the caller still owns it. Detaching hands the file back
        untouched. No further items can be exported afterwards.
        """
        if six.PY3:
            try:
                self.stream.detach()
            except ValueError:
                # Already detached (finish_exporting called twice) or the
                # underlying file was closed by its owner: nothing is left
                # to release.
                pass

    def serialize_field(self, field, name, value):
        """Serialize ``value`` with the field's ``serializer``.

        Without a declared serializer, multi-valued fields are joined with
        ``join_multivalued`` (see :meth:`_join_if_needed`).
        """
        serializer = field.get('serializer', self._join_if_needed)
        return serializer(value)

    def _join_if_needed(self, value):
        """Join a list/tuple into one string; return other values as-is."""
        if isinstance(value, (list, tuple)):
            try:
                return self._join_multivalued.join(value)
            except TypeError:  # list in value may not contain strings
                pass
        return value

    def export_item(self, item):
        """Write ``item`` as one CSV row, emitting headers on first use."""
        if self._headers_not_written:
            self._headers_not_written = False
            self._write_headers_and_set_fields_to_export(item)

        # Missing fields become empty cells so columns stay aligned.
        fields = self._get_serialized_fields(item, default_value='',
                                             include_empty=True)
        values = list(self._build_row(x for _, x in fields))
        self.csv_writer.writerow(values)

    def _build_row(self, values):
        """Yield each value as a native string where conversion applies.

        Values that ``to_native_str`` rejects with ``TypeError`` are
        yielded unchanged.
        """
        for s in values:
            try:
                yield to_native_str(s, self.encoding)
            except TypeError:
                yield s

    def _write_headers_and_set_fields_to_export(self, item):
        """Write the header row, inferring the field list when unset.

        Does nothing when ``include_headers_line`` is false.
        """
        if self.include_headers_line:
            if not self.fields_to_export:
                if isinstance(item, dict):
                    # for dicts try using fields of the first item
                    self.fields_to_export = list(item.keys())
                else:
                    # use fields declared in Item
                    self.fields_to_export = list(item.fields.keys())
            row = list(self._build_row(self.fields_to_export))
            self.csv_writer.writerow(row)
class PickleItemExporter(BaseItemExporter):
    """Exports items by pickling one dict per item into ``file``.

    :param file: file-like object passed to ``pickle.dump``.
    :param protocol: pickle protocol number, default ``2``.
    """

    def __init__(self, file, protocol=2, **kwargs):
        self._configure(kwargs)
        self.file, self.protocol = file, protocol

    def export_item(self, item):
        """Pickle the serialized fields of ``item`` as a dict."""
        serialized = dict(self._get_serialized_fields(item))
        pickle.dump(serialized, self.file, self.protocol)
class MarshalItemExporter(BaseItemExporter):
    """Exports items in a Python-specific binary format (see
    :mod:`marshal`).

    :param file: The file-like object to use for exporting the data. Its
                 ``write`` method should accept :class:`bytes` (a disk file
                 opened in binary mode, a :class:`~io.BytesIO` object, etc)
    """

    def __init__(self, file, **kwargs):
        self._configure(kwargs)
        self.file = file

    def export_item(self, item):
        """Marshal the serialized fields of ``item`` as a dict."""
        serialized = dict(self._get_serialized_fields(item))
        marshal.dump(serialized, self.file)
class PprintItemExporter(BaseItemExporter):
    """Exports items as pretty-printed Python dicts, one per item.

    :param file: file-like object whose ``write`` method receives bytes.
    """

    def __init__(self, file, **kwargs):
        self._configure(kwargs)
        self.file = file

    def export_item(self, item):
        """Write the ``pprint`` rendering of ``item`` plus a newline."""
        serialized = dict(self._get_serialized_fields(item))
        rendered = pprint.pformat(serialized) + '\n'
        # NOTE(review): to_bytes is called without self.encoding here --
        # confirm that ignoring the configured encoding is intended.
        self.file.write(to_bytes(rendered))
class PythonItemExporter(BaseItemExporter):
    """This is a base class for item exporters that extends
    :class:`BaseItemExporter` with support for nested items.

    It serializes items to built-in Python types, so that any serialization
    library (e.g. :mod:`json` or msgpack_) can be used on top of it.

    .. _msgpack: https://pypi.org/project/msgpack/
    """

    def _configure(self, options, dont_fail=False):
        """Handle the extra ``binary`` option, then the base options.

        ``binary`` defaults to true and triggers a deprecation warning when
        enabled. The encoding defaults to ``utf-8``.
        """
        self.binary = options.pop('binary', True)
        super(PythonItemExporter, self)._configure(options, dont_fail)
        if self.binary:
            warnings.warn(
                "PythonItemExporter will drop support for binary export in the future",
                ScrapyDeprecationWarning)
        self.encoding = self.encoding or 'utf-8'

    def serialize_field(self, field, name, value):
        """Serialize ``value`` with the field's ``serializer``.

        Without a declared serializer, :meth:`_serialize_value` is used.
        """
        chosen = field.get('serializer', self._serialize_value)
        return chosen(value)

    def _serialize_value(self, value):
        """Recursively convert ``value`` to built-in Python types.

        Items are exported, dicts and list-likes are walked, text/bytes are
        converted to bytes or unicode depending on ``binary``, and anything
        else is returned unchanged.
        """
        if isinstance(value, BaseItem):
            return self.export_item(value)
        if isinstance(value, dict):
            return dict(self._serialize_dict(value))
        if is_listlike(value):
            return [self._serialize_value(element) for element in value]
        convert = to_bytes if self.binary else to_unicode
        if not isinstance(value, (six.text_type, bytes)):
            return value
        return convert(value, encoding=self.encoding)

    def _serialize_dict(self, value):
        """Yield ``(key, serialized_value)`` pairs for a dict.

        In binary mode keys are converted with ``to_bytes``.
        """
        for key, val in six.iteritems(value):
            if self.binary:
                # NOTE(review): keys are converted without passing
                # self.encoding, unlike values -- confirm this is intended.
                key = to_bytes(key)
            yield key, self._serialize_value(val)

    def export_item(self, item):
        """Return ``item`` as a dict of serialized built-in values."""
        exported = dict(self._get_serialized_fields(item))
        if not self.binary:
            return exported
        return dict(self._serialize_dict(exported))
|