misc.py 4.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164
  1. """Helper functions which don't fit anywhere else"""
  2. import os
  3. import re
  4. import hashlib
  5. from contextlib import contextmanager
  6. from importlib import import_module
  7. from pkgutil import iter_modules
  8. import six
  9. from w3lib.html import replace_entities
  10. from scrapy.utils.python import flatten, to_unicode
  11. from scrapy.item import BaseItem
# Types that are iterable but must be treated as single values by
# arg_to_iter(): dicts, scraped items, and text/bytes strings.
_ITERABLE_SINGLE_VALUES = dict, BaseItem, six.text_type, bytes
  13. def arg_to_iter(arg):
  14. """Convert an argument to an iterable. The argument can be a None, single
  15. value, or an iterable.
  16. Exception: if arg is a dict, [arg] will be returned
  17. """
  18. if arg is None:
  19. return []
  20. elif not isinstance(arg, _ITERABLE_SINGLE_VALUES) and hasattr(arg, '__iter__'):
  21. return arg
  22. else:
  23. return [arg]
  24. def load_object(path):
  25. """Load an object given its absolute object path, and return it.
  26. object can be a class, function, variable or an instance.
  27. path ie: 'scrapy.downloadermiddlewares.redirect.RedirectMiddleware'
  28. """
  29. try:
  30. dot = path.rindex('.')
  31. except ValueError:
  32. raise ValueError("Error loading object '%s': not a full path" % path)
  33. module, name = path[:dot], path[dot+1:]
  34. mod = import_module(module)
  35. try:
  36. obj = getattr(mod, name)
  37. except AttributeError:
  38. raise NameError("Module '%s' doesn't define any object named '%s'" % (module, name))
  39. return obj
  40. def walk_modules(path):
  41. """Loads a module and all its submodules from the given module path and
  42. returns them. If *any* module throws an exception while importing, that
  43. exception is thrown back.
  44. For example: walk_modules('scrapy.utils')
  45. """
  46. mods = []
  47. mod = import_module(path)
  48. mods.append(mod)
  49. if hasattr(mod, '__path__'):
  50. for _, subpath, ispkg in iter_modules(mod.__path__):
  51. fullpath = path + '.' + subpath
  52. if ispkg:
  53. mods += walk_modules(fullpath)
  54. else:
  55. submod = import_module(fullpath)
  56. mods.append(submod)
  57. return mods
  58. def extract_regex(regex, text, encoding='utf-8'):
  59. """Extract a list of unicode strings from the given text/encoding using the following policies:
  60. * if the regex contains a named group called "extract" that will be returned
  61. * if the regex contains multiple numbered groups, all those will be returned (flattened)
  62. * if the regex doesn't contain any group the entire regex matching is returned
  63. """
  64. if isinstance(regex, six.string_types):
  65. regex = re.compile(regex, re.UNICODE)
  66. try:
  67. strings = [regex.search(text).group('extract')] # named group
  68. except Exception:
  69. strings = regex.findall(text) # full regex or numbered groups
  70. strings = flatten(strings)
  71. if isinstance(text, six.text_type):
  72. return [replace_entities(s, keep=['lt', 'amp']) for s in strings]
  73. else:
  74. return [replace_entities(to_unicode(s, encoding), keep=['lt', 'amp'])
  75. for s in strings]
  76. def md5sum(file):
  77. """Calculate the md5 checksum of a file-like object without reading its
  78. whole content in memory.
  79. >>> from io import BytesIO
  80. >>> md5sum(BytesIO(b'file content to hash'))
  81. '784406af91dd5a54fbb9c84c2236595a'
  82. """
  83. m = hashlib.md5()
  84. while True:
  85. d = file.read(8096)
  86. if not d:
  87. break
  88. m.update(d)
  89. return m.hexdigest()
  90. def rel_has_nofollow(rel):
  91. """Return True if link rel attribute has nofollow type"""
  92. return rel is not None and 'nofollow' in rel.split()
  93. def create_instance(objcls, settings, crawler, *args, **kwargs):
  94. """Construct a class instance using its ``from_crawler`` or
  95. ``from_settings`` constructors, if available.
  96. At least one of ``settings`` and ``crawler`` needs to be different from
  97. ``None``. If ``settings `` is ``None``, ``crawler.settings`` will be used.
  98. If ``crawler`` is ``None``, only the ``from_settings`` constructor will be
  99. tried.
  100. ``*args`` and ``**kwargs`` are forwarded to the constructors.
  101. Raises ``ValueError`` if both ``settings`` and ``crawler`` are ``None``.
  102. """
  103. if settings is None:
  104. if crawler is None:
  105. raise ValueError("Specifiy at least one of settings and crawler.")
  106. settings = crawler.settings
  107. if crawler and hasattr(objcls, 'from_crawler'):
  108. return objcls.from_crawler(crawler, *args, **kwargs)
  109. elif hasattr(objcls, 'from_settings'):
  110. return objcls.from_settings(settings, *args, **kwargs)
  111. else:
  112. return objcls(*args, **kwargs)
  113. @contextmanager
  114. def set_environ(**kwargs):
  115. """Temporarily set environment variables inside the context manager and
  116. fully restore previous environment afterwards
  117. """
  118. original_env = {k: os.environ.get(k) for k in kwargs}
  119. os.environ.update(kwargs)
  120. try:
  121. yield
  122. finally:
  123. for k, v in original_env.items():
  124. if v is None:
  125. del os.environ[k]
  126. else:
  127. os.environ[k] = v