# spidermw.py (5.3 KB)

  1. """
  2. Spider Middleware manager
  3. See documentation in docs/topics/spider-middleware.rst
  4. """
  5. from itertools import chain, islice
  6. import six
  7. from twisted.python.failure import Failure
  8. from scrapy.exceptions import _InvalidOutput
  9. from scrapy.middleware import MiddlewareManager
  10. from scrapy.utils.defer import mustbe_deferred
  11. from scrapy.utils.conf import build_component_list
  12. from scrapy.utils.python import MutableChain
  13. def _isiterable(possible_iterator):
  14. return hasattr(possible_iterator, '__iter__')
  15. class SpiderMiddlewareManager(MiddlewareManager):
  16. component_name = 'spider middleware'
  17. @classmethod
  18. def _get_mwlist_from_settings(cls, settings):
  19. return build_component_list(settings.getwithbase('SPIDER_MIDDLEWARES'))
  20. def _add_middleware(self, mw):
  21. super(SpiderMiddlewareManager, self)._add_middleware(mw)
  22. if hasattr(mw, 'process_spider_input'):
  23. self.methods['process_spider_input'].append(mw.process_spider_input)
  24. if hasattr(mw, 'process_start_requests'):
  25. self.methods['process_start_requests'].appendleft(mw.process_start_requests)
  26. self.methods['process_spider_output'].appendleft(getattr(mw, 'process_spider_output', None))
  27. self.methods['process_spider_exception'].appendleft(getattr(mw, 'process_spider_exception', None))
  28. def scrape_response(self, scrape_func, response, request, spider):
  29. fname = lambda f:'%s.%s' % (
  30. six.get_method_self(f).__class__.__name__,
  31. six.get_method_function(f).__name__)
  32. def process_spider_input(response):
  33. for method in self.methods['process_spider_input']:
  34. try:
  35. result = method(response=response, spider=spider)
  36. if result is not None:
  37. raise _InvalidOutput('Middleware {} must return None or raise an exception, got {}' \
  38. .format(fname(method), type(result)))
  39. except _InvalidOutput:
  40. raise
  41. except Exception:
  42. return scrape_func(Failure(), request, spider)
  43. return scrape_func(response, request, spider)
  44. def process_spider_exception(_failure, start_index=0):
  45. exception = _failure.value
  46. # don't handle _InvalidOutput exception
  47. if isinstance(exception, _InvalidOutput):
  48. return _failure
  49. method_list = islice(self.methods['process_spider_exception'], start_index, None)
  50. for method_index, method in enumerate(method_list, start=start_index):
  51. if method is None:
  52. continue
  53. result = method(response=response, exception=exception, spider=spider)
  54. if _isiterable(result):
  55. # stop exception handling by handing control over to the
  56. # process_spider_output chain if an iterable has been returned
  57. return process_spider_output(result, method_index+1)
  58. elif result is None:
  59. continue
  60. else:
  61. raise _InvalidOutput('Middleware {} must return None or an iterable, got {}' \
  62. .format(fname(method), type(result)))
  63. return _failure
  64. def process_spider_output(result, start_index=0):
  65. # items in this iterable do not need to go through the process_spider_output
  66. # chain, they went through it already from the process_spider_exception method
  67. recovered = MutableChain()
  68. def evaluate_iterable(iterable, index):
  69. try:
  70. for r in iterable:
  71. yield r
  72. except Exception as ex:
  73. exception_result = process_spider_exception(Failure(ex), index+1)
  74. if isinstance(exception_result, Failure):
  75. raise
  76. recovered.extend(exception_result)
  77. method_list = islice(self.methods['process_spider_output'], start_index, None)
  78. for method_index, method in enumerate(method_list, start=start_index):
  79. if method is None:
  80. continue
  81. # the following might fail directly if the output value is not a generator
  82. try:
  83. result = method(response=response, result=result, spider=spider)
  84. except Exception as ex:
  85. exception_result = process_spider_exception(Failure(ex), method_index+1)
  86. if isinstance(exception_result, Failure):
  87. raise
  88. return exception_result
  89. if _isiterable(result):
  90. result = evaluate_iterable(result, method_index)
  91. else:
  92. raise _InvalidOutput('Middleware {} must return an iterable, got {}' \
  93. .format(fname(method), type(result)))
  94. return chain(result, recovered)
  95. dfd = mustbe_deferred(process_spider_input, response)
  96. dfd.addCallbacks(callback=process_spider_output, errback=process_spider_exception)
  97. return dfd
  98. def process_start_requests(self, start_requests, spider):
  99. return self._process_chain('process_start_requests', start_requests, spider)