genspider.py 4.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128
  1. from __future__ import print_function
  2. import os
  3. import shutil
  4. import string
  5. from importlib import import_module
  6. from os.path import join, dirname, abspath, exists, splitext
  7. import scrapy
  8. from scrapy.commands import ScrapyCommand
  9. from scrapy.utils.template import render_templatefile, string_camelcase
  10. from scrapy.exceptions import UsageError
  11. def sanitize_module_name(module_name):
  12. """Sanitize the given module name, by replacing dashes and points
  13. with underscores and prefixing it with a letter if it doesn't start
  14. with one
  15. """
  16. module_name = module_name.replace('-', '_').replace('.', '_')
  17. if module_name[0] not in string.ascii_letters:
  18. module_name = "a" + module_name
  19. return module_name
  20. class Command(ScrapyCommand):
  21. requires_project = False
  22. default_settings = {'LOG_ENABLED': False}
  23. def syntax(self):
  24. return "[options] <name> <domain>"
  25. def short_desc(self):
  26. return "Generate new spider using pre-defined templates"
  27. def add_options(self, parser):
  28. ScrapyCommand.add_options(self, parser)
  29. parser.add_option("-l", "--list", dest="list", action="store_true",
  30. help="List available templates")
  31. parser.add_option("-e", "--edit", dest="edit", action="store_true",
  32. help="Edit spider after creating it")
  33. parser.add_option("-d", "--dump", dest="dump", metavar="TEMPLATE",
  34. help="Dump template to standard output")
  35. parser.add_option("-t", "--template", dest="template", default="basic",
  36. help="Uses a custom template.")
  37. parser.add_option("--force", dest="force", action="store_true",
  38. help="If the spider already exists, overwrite it with the template")
  39. def run(self, args, opts):
  40. if opts.list:
  41. self._list_templates()
  42. return
  43. if opts.dump:
  44. template_file = self._find_template(opts.dump)
  45. if template_file:
  46. with open(template_file, "r") as f:
  47. print(f.read())
  48. return
  49. if len(args) != 2:
  50. raise UsageError()
  51. name, domain = args[0:2]
  52. module = sanitize_module_name(name)
  53. if self.settings.get('BOT_NAME') == module:
  54. print("Cannot create a spider with the same name as your project")
  55. return
  56. try:
  57. spidercls = self.crawler_process.spider_loader.load(name)
  58. except KeyError:
  59. pass
  60. else:
  61. # if spider already exists and not --force then halt
  62. if not opts.force:
  63. print("Spider %r already exists in module:" % name)
  64. print(" %s" % spidercls.__module__)
  65. return
  66. template_file = self._find_template(opts.template)
  67. if template_file:
  68. self._genspider(module, name, domain, opts.template, template_file)
  69. if opts.edit:
  70. self.exitcode = os.system('scrapy edit "%s"' % name)
  71. def _genspider(self, module, name, domain, template_name, template_file):
  72. """Generate the spider module, based on the given template"""
  73. tvars = {
  74. 'project_name': self.settings.get('BOT_NAME'),
  75. 'ProjectName': string_camelcase(self.settings.get('BOT_NAME')),
  76. 'module': module,
  77. 'name': name,
  78. 'domain': domain,
  79. 'classname': '%sSpider' % ''.join(s.capitalize() \
  80. for s in module.split('_'))
  81. }
  82. if self.settings.get('NEWSPIDER_MODULE'):
  83. spiders_module = import_module(self.settings['NEWSPIDER_MODULE'])
  84. spiders_dir = abspath(dirname(spiders_module.__file__))
  85. else:
  86. spiders_module = None
  87. spiders_dir = "."
  88. spider_file = "%s.py" % join(spiders_dir, module)
  89. shutil.copyfile(template_file, spider_file)
  90. render_templatefile(spider_file, **tvars)
  91. print("Created spider %r using template %r " % (name, \
  92. template_name), end=('' if spiders_module else '\n'))
  93. if spiders_module:
  94. print("in module:\n %s.%s" % (spiders_module.__name__, module))
  95. def _find_template(self, template):
  96. template_file = join(self.templates_dir, '%s.tmpl' % template)
  97. if exists(template_file):
  98. return template_file
  99. print("Unable to find template: %s\n" % template)
  100. print('Use "scrapy genspider --list" to see all available templates.')
  101. def _list_templates(self):
  102. print("Available templates:")
  103. for filename in sorted(os.listdir(self.templates_dir)):
  104. if filename.endswith('.tmpl'):
  105. print(" %s" % splitext(filename)[0])
  106. @property
  107. def templates_dir(self):
  108. _templates_base_dir = self.settings['TEMPLATES_DIR'] or \
  109. join(scrapy.__path__[0], 'templates')
  110. return join(_templates_base_dir, 'spiders')