spider.py

import inspect
import logging

import six

from scrapy.spiders import Spider
from scrapy.utils.misc import arg_to_iter

logger = logging.getLogger(__name__)


def iterate_spider_output(result):
    # arg_to_iter turns None into an empty iterable and wraps single
    # values in a list; most iterables pass through unchanged.
    return arg_to_iter(result)
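

# Usage sketch (illustrative, not part of the original module): a callback may
# return a single item, a list or a generator; iterate_spider_output gives
# callers one uniform loop over whatever came back, e.g.:
#
#     for item_or_request in iterate_spider_output(spider.parse(response)):
#         process(item_or_request)  # process() is hypothetical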


def iter_spider_classes(module):
    """Return an iterator over all spider classes defined in the given module
    that can be instantiated (i.e. those that have a name).
    """
    # this needs to be imported here until we get rid of the spider manager
    # singleton in scrapy.spider.spiders
    from scrapy.spiders import Spider

    for obj in six.itervalues(vars(module)):
        if inspect.isclass(obj) and \
                issubclass(obj, Spider) and \
                obj.__module__ == module.__name__ and \
                getattr(obj, 'name', None):
            yield obj
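

# Sketch of intended use (assumed caller code, not in this module): given an
# imported module object, collect its instantiable spider classes. The module
# path below is hypothetical.
#
#     import myproject.spiders.products as spiders_module
#
#     spider_classes = list(iter_spider_classes(spiders_module))
#     spider_names = [cls.name for cls in spider_classes]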


def spidercls_for_request(spider_loader, request, default_spidercls=None,
                          log_none=False, log_multiple=False):
    """Return a spider class that handles the given Request.

    This will look for the spiders that can handle the given request (using
    the spider loader) and return a Spider class if (and only if) there is
    only one Spider able to handle the Request.

    If multiple spiders (or no spider) are found, it will return the
    default_spidercls passed. It can optionally log if multiple or no spiders
    are found.
    """
    snames = spider_loader.find_by_request(request)
    if len(snames) == 1:
        return spider_loader.load(snames[0])

    if len(snames) > 1 and log_multiple:
        logger.error('More than one spider can handle: %(request)s - %(snames)s',
                     {'request': request, 'snames': ', '.join(snames)})

    if len(snames) == 0 and log_none:
        logger.error('Unable to find spider that handles: %(request)s',
                     {'request': request})

    return default_spidercls
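

# Rough usage sketch (assumptions: spider_loader comes from a running crawler,
# e.g. crawler.spider_loader; the URL is made up). If exactly one spider's
# allowed domains match the request, its class is returned; otherwise the
# fallback DefaultSpider defined below is used:
#
#     from scrapy import Request
#
#     request = Request('https://example.com/some/page')
#     spidercls = spidercls_for_request(spider_loader, request,
#                                       default_spidercls=DefaultSpider,
#                                       log_none=True, log_multiple=True)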


class DefaultSpider(Spider):

    name = 'default'