parse.py

from __future__ import print_function
import json
import logging

from w3lib.url import is_url

from scrapy.commands import ScrapyCommand
from scrapy.http import Request
from scrapy.item import BaseItem
from scrapy.utils import display
from scrapy.utils.conf import arglist_to_dict
from scrapy.utils.spider import iterate_spider_output, spidercls_for_request
from scrapy.exceptions import UsageError

logger = logging.getLogger(__name__)
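

# The Command class below backs the ``scrapy parse`` CLI command: it fetches a
# single URL with the matching (or explicitly named) spider and prints the
# items and follow-up requests produced by the chosen callback. An illustrative
# invocation, built only from the options defined in add_options below (the
# spider name "myspider" and callback "parse_item" are placeholders, not names
# defined in this file):
#
#     scrapy parse https://example.com --spider=myspider -c parse_item -d 2 -v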
class Command(ScrapyCommand):
    requires_project = True

    spider = None
    items = {}
    requests = {}

    first_response = None

    def syntax(self):
        return "[options] <url>"

    def short_desc(self):
        return "Parse URL (using its spider) and print the results"

    def add_options(self, parser):
        ScrapyCommand.add_options(self, parser)
        parser.add_option("--spider", dest="spider", default=None,
                          help="use this spider without looking for one")
        parser.add_option("-a", dest="spargs", action="append", default=[], metavar="NAME=VALUE",
                          help="set spider argument (may be repeated)")
        parser.add_option("--pipelines", action="store_true",
                          help="process items through pipelines")
        parser.add_option("--nolinks", dest="nolinks", action="store_true",
                          help="don't show links to follow (extracted requests)")
        parser.add_option("--noitems", dest="noitems", action="store_true",
                          help="don't show scraped items")
        parser.add_option("--nocolour", dest="nocolour", action="store_true",
                          help="avoid using pygments to colorize the output")
        parser.add_option("-r", "--rules", dest="rules", action="store_true",
                          help="use CrawlSpider rules to discover the callback")
        parser.add_option("-c", "--callback", dest="callback",
                          help="use this callback for parsing, instead of looking for a callback")
        parser.add_option("-m", "--meta", dest="meta",
                          help="inject extra meta into the Request; it must be a valid raw json string")
        parser.add_option("--cbkwargs", dest="cbkwargs",
                          help="inject extra callback kwargs into the Request; it must be a valid raw json string")
        parser.add_option("-d", "--depth", dest="depth", type="int", default=1,
                          help="maximum depth for parsing requests [default: %default]")
        parser.add_option("-v", "--verbose", dest="verbose", action="store_true",
                          help="print each depth level one by one")
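
    # Items and requests are collected per depth level: ``self.items`` and
    # ``self.requests`` map a depth (1, 2, ...) to the lists gathered at that
    # level, and ``max_level`` is the deepest level for which anything was
    # recorded.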
    @property
    def max_level(self):
        max_items, max_requests = 0, 0
        if self.items:
            max_items = max(self.items)
        if self.requests:
            max_requests = max(self.requests)
        return max(max_items, max_requests)

    def add_items(self, lvl, new_items):
        old_items = self.items.get(lvl, [])
        self.items[lvl] = old_items + new_items

    def add_requests(self, lvl, new_reqs):
        old_reqs = self.requests.get(lvl, [])
        self.requests[lvl] = old_reqs + new_reqs

    def print_items(self, lvl=None, colour=True):
        if lvl is None:
            items = [item for lst in self.items.values() for item in lst]
        else:
            items = self.items.get(lvl, [])

        print("# Scraped Items ", "-"*60)
        display.pprint([dict(x) for x in items], colorize=colour)

    def print_requests(self, lvl=None, colour=True):
        if lvl is None:
            if self.requests:
                requests = self.requests[max(self.requests)]
            else:
                requests = []
        else:
            requests = self.requests.get(lvl, [])

        print("# Requests ", "-"*65)
        display.pprint(requests, colorize=colour)

    def print_results(self, opts):
        colour = not opts.nocolour

        if opts.verbose:
            for level in range(1, self.max_level+1):
                print('\n>>> DEPTH LEVEL: %s <<<' % level)
                if not opts.noitems:
                    self.print_items(level, colour)
                if not opts.nolinks:
                    self.print_requests(level, colour)
        else:
            print('\n>>> STATUS DEPTH LEVEL %s <<<' % self.max_level)
            if not opts.noitems:
                self.print_items(colour=colour)
            if not opts.nolinks:
                self.print_requests(colour=colour)
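
    # ``run_callback`` drives a single callback invocation:
    # ``iterate_spider_output`` turns whatever the callback returns (a single
    # value, a list, or a generator) into an iterable, and each result is
    # sorted into scraped items (BaseItem or dict) or follow-up Requests.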
    def run_callback(self, response, callback, cb_kwargs=None):
        cb_kwargs = cb_kwargs or {}
        items, requests = [], []

        for x in iterate_spider_output(callback(response, **cb_kwargs)):
            if isinstance(x, (BaseItem, dict)):
                items.append(x)
            elif isinstance(x, Request):
                requests.append(x)
        return items, requests
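
    # With -r/--rules, the callback comes from the first CrawlSpider rule whose
    # link extractor matches the response URL; a rule without an explicit
    # callback falls back to "parse". A spider with no rules logs an error.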
    def get_callback_from_rules(self, spider, response):
        if getattr(spider, 'rules', None):
            for rule in spider.rules:
                if rule.link_extractor.matches(response.url):
                    return rule.callback or "parse"
        else:
            logger.error('No CrawlSpider rules found in spider %(spider)r, '
                         'please specify a callback to use for parsing',
                         {'spider': spider.name})
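
    # ``set_spidercls`` resolves which spider class will handle the URL: the
    # one named by --spider if given, otherwise the class the project's spider
    # loader reports as able to handle the request (``spidercls_for_request``).
    # Its ``start_requests`` is then replaced so the crawl starts with the
    # single request built by ``prepare_request``.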
    def set_spidercls(self, url, opts):
        spider_loader = self.crawler_process.spider_loader
        if opts.spider:
            try:
                self.spidercls = spider_loader.load(opts.spider)
            except KeyError:
                logger.error('Unable to find spider: %(spider)s',
                             {'spider': opts.spider})
        else:
            self.spidercls = spidercls_for_request(spider_loader, Request(url))
            if not self.spidercls:
                logger.error('Unable to find spider for: %(url)s', {'url': url})

        # Request requires callback argument as callable or None, not string
        request = Request(url, None)
        _start_requests = lambda s: [self.prepare_request(s, request, opts)]
        self.spidercls.start_requests = _start_requests

    def start_parsing(self, url, opts):
        self.crawler_process.crawl(self.spidercls, **opts.spargs)
        self.pcrawler = list(self.crawler_process.crawlers)[0]
        self.crawler_process.start()

        if not self.first_response:
            logger.error('No response downloaded for: %(url)s',
                         {'url': url})
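
    # ``prepare_request`` wraps the real callback: the user's (or spider's)
    # callback is stashed in ``meta['_callback']`` while the command's own
    # ``callback`` closure is installed on the request. On each response the
    # closure resolves the real callback, records the items and requests it
    # yields under the current ``meta['_depth']``, and, while the -d/--depth
    # limit has not been reached, re-wraps the extracted requests the same way
    # so they are followed and recorded too.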
    def prepare_request(self, spider, request, opts):
        def callback(response, **cb_kwargs):
            # memorize first request
            if not self.first_response:
                self.first_response = response

            # determine real callback
            cb = response.meta['_callback']
            if not cb:
                if opts.callback:
                    cb = opts.callback
                elif opts.rules and self.first_response == response:
                    cb = self.get_callback_from_rules(spider, response)

                    if not cb:
                        logger.error('Cannot find a rule that matches %(url)r in spider: %(spider)s',
                                     {'url': response.url, 'spider': spider.name})
                        return
                else:
                    cb = 'parse'

            if not callable(cb):
                cb_method = getattr(spider, cb, None)
                if callable(cb_method):
                    cb = cb_method
                else:
                    logger.error('Cannot find callback %(callback)r in spider: %(spider)s',
                                 {'callback': cb, 'spider': spider.name})
                    return

            # parse items and requests
            depth = response.meta['_depth']
            items, requests = self.run_callback(response, cb, cb_kwargs)

            if opts.pipelines:
                itemproc = self.pcrawler.engine.scraper.itemproc
                for item in items:
                    itemproc.process_item(item, spider)

            self.add_items(depth, items)
            self.add_requests(depth, requests)

            if depth < opts.depth:
                for req in requests:
                    req.meta['_depth'] = depth + 1
                    req.meta['_callback'] = req.callback
                    req.callback = callback
            return requests

        # update request meta if any extra meta was passed through the --meta/-m opts
        if opts.meta:
            request.meta.update(opts.meta)

        # update cb_kwargs if any extra values were passed through the --cbkwargs option
        if opts.cbkwargs:
            request.cb_kwargs.update(opts.cbkwargs)

        request.meta['_depth'] = 1
        request.meta['_callback'] = request.callback
        request.callback = callback
        return request
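
    # Option post-processing: -a NAME=VALUE pairs become the spider-argument
    # dict, while -m/--meta and --cbkwargs must be raw JSON strings, e.g.
    # --meta='{"foo": "bar"}' or --cbkwargs='{"foo": "bar"}' (the same examples
    # shown in the usage errors below). Invalid values raise UsageError.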
    def process_options(self, args, opts):
        ScrapyCommand.process_options(self, args, opts)

        self.process_spider_arguments(opts)
        self.process_request_meta(opts)
        self.process_request_cb_kwargs(opts)

    def process_spider_arguments(self, opts):
        try:
            opts.spargs = arglist_to_dict(opts.spargs)
        except ValueError:
            raise UsageError("Invalid -a value, use -a NAME=VALUE", print_help=False)

    def process_request_meta(self, opts):
        if opts.meta:
            try:
                opts.meta = json.loads(opts.meta)
            except ValueError:
                raise UsageError("Invalid -m/--meta value, pass a valid json string to -m or --meta. "
                                 "Example: --meta='{\"foo\" : \"bar\"}'", print_help=False)

    def process_request_cb_kwargs(self, opts):
        if opts.cbkwargs:
            try:
                opts.cbkwargs = json.loads(opts.cbkwargs)
            except ValueError:
                raise UsageError("Invalid --cbkwargs value, pass a valid json string to --cbkwargs. "
                                 "Example: --cbkwargs='{\"foo\" : \"bar\"}'", print_help=False)
    def run(self, args, opts):
        # parse arguments
        if not len(args) == 1 or not is_url(args[0]):
            raise UsageError()
        else:
            url = args[0]

        # prepare spidercls
        self.set_spidercls(url, opts)

        if self.spidercls and opts.depth > 0:
            self.start_parsing(url, opts)
            self.print_results(opts)