# processors.py (3.4 KB)
  1. """
  2. This module provides some commonly used processors for Item Loaders.
  3. See documentation in docs/topics/loaders.rst
  4. """
  5. try:
  6. from collections import ChainMap
  7. except ImportError:
  8. from scrapy.utils.datatypes import MergeDict as ChainMap
  9. from scrapy.utils.misc import arg_to_iter
  10. from scrapy.loader.common import wrap_loader_context
  11. class MapCompose(object):
  12. def __init__(self, *functions, **default_loader_context):
  13. self.functions = functions
  14. self.default_loader_context = default_loader_context
  15. def __call__(self, value, loader_context=None):
  16. values = arg_to_iter(value)
  17. if loader_context:
  18. context = ChainMap(loader_context, self.default_loader_context)
  19. else:
  20. context = self.default_loader_context
  21. wrapped_funcs = [wrap_loader_context(f, context) for f in self.functions]
  22. for func in wrapped_funcs:
  23. next_values = []
  24. for v in values:
  25. try:
  26. next_values += arg_to_iter(func(v))
  27. except Exception as e:
  28. raise ValueError("Error in MapCompose with "
  29. "%s value=%r error='%s: %s'" %
  30. (str(func), value, type(e).__name__,
  31. str(e)))
  32. values = next_values
  33. return values
  34. class Compose(object):
  35. def __init__(self, *functions, **default_loader_context):
  36. self.functions = functions
  37. self.stop_on_none = default_loader_context.get('stop_on_none', True)
  38. self.default_loader_context = default_loader_context
  39. def __call__(self, value, loader_context=None):
  40. if loader_context:
  41. context = ChainMap(loader_context, self.default_loader_context)
  42. else:
  43. context = self.default_loader_context
  44. wrapped_funcs = [wrap_loader_context(f, context) for f in self.functions]
  45. for func in wrapped_funcs:
  46. if value is None and self.stop_on_none:
  47. break
  48. try:
  49. value = func(value)
  50. except Exception as e:
  51. raise ValueError("Error in Compose with "
  52. "%s value=%r error='%s: %s'" %
  53. (str(func), value, type(e).__name__, str(e)))
  54. return value
  55. class TakeFirst(object):
  56. def __call__(self, values):
  57. for value in values:
  58. if value is not None and value != '':
  59. return value
  60. class Identity(object):
  61. def __call__(self, values):
  62. return values
  63. class SelectJmes(object):
  64. """
  65. Query the input string for the jmespath (given at instantiation),
  66. and return the answer
  67. Requires : jmespath(https://github.com/jmespath/jmespath)
  68. Note: SelectJmes accepts only one input element at a time.
  69. """
  70. def __init__(self, json_path):
  71. self.json_path = json_path
  72. import jmespath
  73. self.compiled_path = jmespath.compile(self.json_path)
  74. def __call__(self, value):
  75. """Query value for the jmespath query and return answer
  76. :param value: a data structure (dict, list) to extract from
  77. :return: Element extracted according to jmespath query
  78. """
  79. return self.compiled_path.search(value)
  80. class Join(object):
  81. def __init__(self, separator=u' '):
  82. self.separator = separator
  83. def __call__(self, values):
  84. return self.separator.join(values)