__init__.py 7.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211
  1. """
  2. Item Loader
  3. See documentation in docs/topics/loaders.rst
  4. """
  5. from collections import defaultdict
  6. import six
  7. from scrapy.item import Item
  8. from scrapy.loader.common import wrap_loader_context
  9. from scrapy.loader.processors import Identity
  10. from scrapy.selector import Selector
  11. from scrapy.utils.misc import arg_to_iter, extract_regex
  12. from scrapy.utils.python import flatten
  13. class ItemLoader(object):
  14. default_item_class = Item
  15. default_input_processor = Identity()
  16. default_output_processor = Identity()
  17. default_selector_class = Selector
  18. def __init__(self, item=None, selector=None, response=None, parent=None, **context):
  19. if selector is None and response is not None:
  20. selector = self.default_selector_class(response)
  21. self.selector = selector
  22. context.update(selector=selector, response=response)
  23. if item is None:
  24. item = self.default_item_class()
  25. self.context = context
  26. self.parent = parent
  27. self._local_item = context['item'] = item
  28. self._local_values = defaultdict(list)
  29. # values from initial item
  30. for field_name, value in item.items():
  31. self._values[field_name] += arg_to_iter(value)
  32. @property
  33. def _values(self):
  34. if self.parent is not None:
  35. return self.parent._values
  36. else:
  37. return self._local_values
  38. @property
  39. def item(self):
  40. if self.parent is not None:
  41. return self.parent.item
  42. else:
  43. return self._local_item
  44. def nested_xpath(self, xpath, **context):
  45. selector = self.selector.xpath(xpath)
  46. context.update(selector=selector)
  47. subloader = self.__class__(
  48. item=self.item, parent=self, **context
  49. )
  50. return subloader
  51. def nested_css(self, css, **context):
  52. selector = self.selector.css(css)
  53. context.update(selector=selector)
  54. subloader = self.__class__(
  55. item=self.item, parent=self, **context
  56. )
  57. return subloader
  58. def add_value(self, field_name, value, *processors, **kw):
  59. value = self.get_value(value, *processors, **kw)
  60. if value is None:
  61. return
  62. if not field_name:
  63. for k, v in six.iteritems(value):
  64. self._add_value(k, v)
  65. else:
  66. self._add_value(field_name, value)
  67. def replace_value(self, field_name, value, *processors, **kw):
  68. value = self.get_value(value, *processors, **kw)
  69. if value is None:
  70. return
  71. if not field_name:
  72. for k, v in six.iteritems(value):
  73. self._replace_value(k, v)
  74. else:
  75. self._replace_value(field_name, value)
  76. def _add_value(self, field_name, value):
  77. value = arg_to_iter(value)
  78. processed_value = self._process_input_value(field_name, value)
  79. if processed_value:
  80. self._values[field_name] += arg_to_iter(processed_value)
  81. def _replace_value(self, field_name, value):
  82. self._values.pop(field_name, None)
  83. self._add_value(field_name, value)
  84. def get_value(self, value, *processors, **kw):
  85. regex = kw.get('re', None)
  86. if regex:
  87. value = arg_to_iter(value)
  88. value = flatten(extract_regex(regex, x) for x in value)
  89. for proc in processors:
  90. if value is None:
  91. break
  92. _proc = proc
  93. proc = wrap_loader_context(proc, self.context)
  94. try:
  95. value = proc(value)
  96. except Exception as e:
  97. raise ValueError("Error with processor %s value=%r error='%s: %s'" %
  98. (_proc.__class__.__name__, value,
  99. type(e).__name__, str(e)))
  100. return value
  101. def load_item(self):
  102. item = self.item
  103. for field_name in tuple(self._values):
  104. value = self.get_output_value(field_name)
  105. if value is not None:
  106. item[field_name] = value
  107. return item
  108. def get_output_value(self, field_name):
  109. proc = self.get_output_processor(field_name)
  110. proc = wrap_loader_context(proc, self.context)
  111. try:
  112. return proc(self._values[field_name])
  113. except Exception as e:
  114. raise ValueError("Error with output processor: field=%r value=%r error='%s: %s'" %
  115. (field_name, self._values[field_name], type(e).__name__, str(e)))
  116. def get_collected_values(self, field_name):
  117. return self._values[field_name]
  118. def get_input_processor(self, field_name):
  119. proc = getattr(self, '%s_in' % field_name, None)
  120. if not proc:
  121. proc = self._get_item_field_attr(field_name, 'input_processor',
  122. self.default_input_processor)
  123. return proc
  124. def get_output_processor(self, field_name):
  125. proc = getattr(self, '%s_out' % field_name, None)
  126. if not proc:
  127. proc = self._get_item_field_attr(field_name, 'output_processor',
  128. self.default_output_processor)
  129. return proc
  130. def _process_input_value(self, field_name, value):
  131. proc = self.get_input_processor(field_name)
  132. _proc = proc
  133. proc = wrap_loader_context(proc, self.context)
  134. try:
  135. return proc(value)
  136. except Exception as e:
  137. raise ValueError(
  138. "Error with input processor %s: field=%r value=%r "
  139. "error='%s: %s'" % (_proc.__class__.__name__, field_name,
  140. value, type(e).__name__, str(e)))
  141. def _get_item_field_attr(self, field_name, key, default=None):
  142. if isinstance(self.item, Item):
  143. value = self.item.fields[field_name].get(key, default)
  144. else:
  145. value = default
  146. return value
  147. def _check_selector_method(self):
  148. if self.selector is None:
  149. raise RuntimeError("To use XPath or CSS selectors, "
  150. "%s must be instantiated with a selector "
  151. "or a response" % self.__class__.__name__)
  152. def add_xpath(self, field_name, xpath, *processors, **kw):
  153. values = self._get_xpathvalues(xpath, **kw)
  154. self.add_value(field_name, values, *processors, **kw)
  155. def replace_xpath(self, field_name, xpath, *processors, **kw):
  156. values = self._get_xpathvalues(xpath, **kw)
  157. self.replace_value(field_name, values, *processors, **kw)
  158. def get_xpath(self, xpath, *processors, **kw):
  159. values = self._get_xpathvalues(xpath, **kw)
  160. return self.get_value(values, *processors, **kw)
  161. def _get_xpathvalues(self, xpaths, **kw):
  162. self._check_selector_method()
  163. xpaths = arg_to_iter(xpaths)
  164. return flatten(self.selector.xpath(xpath).getall() for xpath in xpaths)
  165. def add_css(self, field_name, css, *processors, **kw):
  166. values = self._get_cssvalues(css, **kw)
  167. self.add_value(field_name, values, *processors, **kw)
  168. def replace_css(self, field_name, css, *processors, **kw):
  169. values = self._get_cssvalues(css, **kw)
  170. self.replace_value(field_name, values, *processors, **kw)
  171. def get_css(self, css, *processors, **kw):
  172. values = self._get_cssvalues(css, **kw)
  173. return self.get_value(values, *processors, **kw)
  174. def _get_cssvalues(self, csss, **kw):
  175. self._check_selector_method()
  176. csss = arg_to_iter(csss)
  177. return flatten(self.selector.css(css).getall() for css in csss)