default_settings.py

  1. """
  2. This module contains the default values for all settings used by Scrapy.
  3. For more information about these settings you can read the settings
  4. documentation in docs/topics/settings.rst
  5. Scrapy developers, if you add a setting here remember to:
  6. * add it in alphabetical order
  7. * group similar settings without leaving blank lines
  8. * add its documentation to the available settings documentation
  9. (docs/topics/settings.rst)
  10. """

import sys
from importlib import import_module
from os.path import join, abspath, dirname

import six

AJAXCRAWL_ENABLED = False

AUTOTHROTTLE_ENABLED = False
AUTOTHROTTLE_DEBUG = False
AUTOTHROTTLE_MAX_DELAY = 60.0
AUTOTHROTTLE_START_DELAY = 5.0
AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
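
# AUTOTHROTTLE_TARGET_CONCURRENCY is the average number of requests Scrapy
# should be sending in parallel to each remote site. An illustrative override
# (not part of the defaults) for politely crawling a slow site from a
# project's settings.py:
#
#     AUTOTHROTTLE_ENABLED = True
#     AUTOTHROTTLE_START_DELAY = 5.0
#     AUTOTHROTTLE_TARGET_CONCURRENCY = 0.5  # ~one request per two slots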

BOT_NAME = 'scrapybot'

CLOSESPIDER_TIMEOUT = 0
CLOSESPIDER_PAGECOUNT = 0
CLOSESPIDER_ITEMCOUNT = 0
CLOSESPIDER_ERRORCOUNT = 0

COMMANDS_MODULE = ''

COMPRESSION_ENABLED = True

CONCURRENT_ITEMS = 100

CONCURRENT_REQUESTS = 16
CONCURRENT_REQUESTS_PER_DOMAIN = 8
CONCURRENT_REQUESTS_PER_IP = 0

COOKIES_ENABLED = True
COOKIES_DEBUG = False

DEFAULT_ITEM_CLASS = 'scrapy.item.Item'

DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
}

DEPTH_LIMIT = 0
DEPTH_STATS_VERBOSE = False
DEPTH_PRIORITY = 0

DNSCACHE_ENABLED = True
DNSCACHE_SIZE = 10000
DNS_TIMEOUT = 60

DOWNLOAD_DELAY = 0
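
# DOWNLOAD_DELAY is expressed in seconds and accepts floats. With the default
# RANDOMIZE_DOWNLOAD_DELAY = True (below), the actual wait is a random value
# between 0.5 * DOWNLOAD_DELAY and 1.5 * DOWNLOAD_DELAY. An illustrative
# project override:
#
#     DOWNLOAD_DELAY = 0.25  # ~250 ms between requests to the same slot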

DOWNLOAD_HANDLERS = {}
DOWNLOAD_HANDLERS_BASE = {
    'data': 'scrapy.core.downloader.handlers.datauri.DataURIDownloadHandler',
    'file': 'scrapy.core.downloader.handlers.file.FileDownloadHandler',
    'http': 'scrapy.core.downloader.handlers.http.HTTPDownloadHandler',
    'https': 'scrapy.core.downloader.handlers.http.HTTPDownloadHandler',
    's3': 'scrapy.core.downloader.handlers.s3.S3DownloadHandler',
    'ftp': 'scrapy.core.downloader.handlers.ftp.FTPDownloadHandler',
}
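
# The *_BASE dicts hold Scrapy's built-in defaults and are merged with the
# user-facing counterpart setting. For example, a project can disable the FTP
# handler by mapping its scheme to None in settings.py:
#
#     DOWNLOAD_HANDLERS = {'ftp': None}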

DOWNLOAD_TIMEOUT = 180  # 3 minutes

DOWNLOAD_MAXSIZE = 1024 * 1024 * 1024  # 1024 MiB (1 GiB)
DOWNLOAD_WARNSIZE = 32 * 1024 * 1024   # 32 MiB

DOWNLOAD_FAIL_ON_DATALOSS = True

DOWNLOADER = 'scrapy.core.downloader.Downloader'

DOWNLOADER_HTTPCLIENTFACTORY = 'scrapy.core.downloader.webclient.ScrapyHTTPClientFactory'
DOWNLOADER_CLIENTCONTEXTFACTORY = 'scrapy.core.downloader.contextfactory.ScrapyClientContextFactory'
DOWNLOADER_CLIENT_TLS_CIPHERS = 'DEFAULT'
# Use the highest TLS/SSL protocol version supported by the platform, also
# allowing negotiation:
DOWNLOADER_CLIENT_TLS_METHOD = 'TLS'
DOWNLOADER_CLIENT_TLS_VERBOSE_LOGGING = False

DOWNLOADER_MIDDLEWARES = {}

DOWNLOADER_MIDDLEWARES_BASE = {
    # Engine side
    'scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware': 100,
    'scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware': 300,
    'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware': 350,
    'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware': 400,
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': 500,
    'scrapy.downloadermiddlewares.retry.RetryMiddleware': 550,
    'scrapy.downloadermiddlewares.ajaxcrawl.AjaxCrawlMiddleware': 560,
    'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware': 580,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 590,
    'scrapy.downloadermiddlewares.redirect.RedirectMiddleware': 600,
    'scrapy.downloadermiddlewares.cookies.CookiesMiddleware': 700,
    'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 750,
    'scrapy.downloadermiddlewares.stats.DownloaderStats': 850,
    'scrapy.downloadermiddlewares.httpcache.HttpCacheMiddleware': 900,
    # Downloader side
}
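
# Middleware order values place a middleware between the engine (lower
# numbers) and the downloader (higher numbers): process_request() is called
# in increasing order, process_response() in decreasing order. A project can
# disable a built-in middleware by mapping it to None, e.g.:
#
#     DOWNLOADER_MIDDLEWARES = {
#         'scrapy.downloadermiddlewares.retry.RetryMiddleware': None,
#     }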

DOWNLOADER_STATS = True

DUPEFILTER_CLASS = 'scrapy.dupefilters.RFPDupeFilter'

EDITOR = 'vi'
if sys.platform == 'win32':
    EDITOR = '%s -m idlelib.idle'

EXTENSIONS = {}

EXTENSIONS_BASE = {
    'scrapy.extensions.corestats.CoreStats': 0,
    'scrapy.extensions.telnet.TelnetConsole': 0,
    'scrapy.extensions.memusage.MemoryUsage': 0,
    'scrapy.extensions.memdebug.MemoryDebugger': 0,
    'scrapy.extensions.closespider.CloseSpider': 0,
    'scrapy.extensions.feedexport.FeedExporter': 0,
    'scrapy.extensions.logstats.LogStats': 0,
    'scrapy.extensions.spiderstate.SpiderState': 0,
    'scrapy.extensions.throttle.AutoThrottle': 0,
}

FEED_TEMPDIR = None
FEED_URI = None
FEED_URI_PARAMS = None  # a function to extend uri arguments
FEED_FORMAT = 'jsonlines'
FEED_STORE_EMPTY = False
FEED_EXPORT_ENCODING = None
FEED_EXPORT_FIELDS = None
FEED_STORAGES = {}
FEED_STORAGES_BASE = {
    '': 'scrapy.extensions.feedexport.FileFeedStorage',
    'file': 'scrapy.extensions.feedexport.FileFeedStorage',
    'stdout': 'scrapy.extensions.feedexport.StdoutFeedStorage',
    's3': 'scrapy.extensions.feedexport.S3FeedStorage',
    'ftp': 'scrapy.extensions.feedexport.FTPFeedStorage',
}
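
# The feed storage backend is picked by the scheme of FEED_URI; the '' key
# handles local paths given without a scheme. An illustrative export to S3
# (bucket name is hypothetical):
#
#     FEED_URI = 's3://example-bucket/items.jl'
#     FEED_FORMAT = 'jsonlines'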

FEED_EXPORTERS = {}
FEED_EXPORTERS_BASE = {
    'json': 'scrapy.exporters.JsonItemExporter',
    'jsonlines': 'scrapy.exporters.JsonLinesItemExporter',
    'jl': 'scrapy.exporters.JsonLinesItemExporter',
    'csv': 'scrapy.exporters.CsvItemExporter',
    'xml': 'scrapy.exporters.XmlItemExporter',
    'marshal': 'scrapy.exporters.MarshalItemExporter',
    'pickle': 'scrapy.exporters.PickleItemExporter',
}
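
# FEED_FORMAT selects an exporter by its key in this mapping. Projects can
# register additional serialization formats via FEED_EXPORTERS, e.g. (the
# exporter path below is hypothetical):
#
#     FEED_EXPORTERS = {'msgpack': 'myproject.exporters.MsgpackItemExporter'}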
FEED_EXPORT_INDENT = 0

FEED_STORAGE_FTP_ACTIVE = False
FEED_STORAGE_S3_ACL = ''

FILES_STORE_S3_ACL = 'private'
FILES_STORE_GCS_ACL = ''

FTP_USER = 'anonymous'
FTP_PASSWORD = 'guest'
FTP_PASSIVE_MODE = True

HTTPCACHE_ENABLED = False
HTTPCACHE_DIR = 'httpcache'
HTTPCACHE_IGNORE_MISSING = False
HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
HTTPCACHE_EXPIRATION_SECS = 0
HTTPCACHE_ALWAYS_STORE = False
HTTPCACHE_IGNORE_HTTP_CODES = []
HTTPCACHE_IGNORE_SCHEMES = ['file']
HTTPCACHE_IGNORE_RESPONSE_CACHE_CONTROLS = []
HTTPCACHE_DBM_MODULE = 'anydbm' if six.PY2 else 'dbm'
HTTPCACHE_POLICY = 'scrapy.extensions.httpcache.DummyPolicy'
HTTPCACHE_GZIP = False
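
# Illustrative development override: cache every response on disk so repeated
# runs replay it instead of re-downloading. HTTPCACHE_EXPIRATION_SECS = 0
# means cached responses never expire:
#
#     HTTPCACHE_ENABLED = True
#     HTTPCACHE_DIR = 'httpcache'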

HTTPPROXY_ENABLED = True
HTTPPROXY_AUTH_ENCODING = 'latin-1'

IMAGES_STORE_S3_ACL = 'private'
IMAGES_STORE_GCS_ACL = ''

ITEM_PROCESSOR = 'scrapy.pipelines.ItemPipelineManager'

ITEM_PIPELINES = {}
ITEM_PIPELINES_BASE = {}

LOG_ENABLED = True
LOG_ENCODING = 'utf-8'
LOG_FORMATTER = 'scrapy.logformatter.LogFormatter'
LOG_FORMAT = '%(asctime)s [%(name)s] %(levelname)s: %(message)s'
LOG_DATEFORMAT = '%Y-%m-%d %H:%M:%S'
LOG_STDOUT = False
LOG_LEVEL = 'DEBUG'
LOG_FILE = None
LOG_SHORT_NAMES = False
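
# Illustrative logging override for quieter production runs (the file name is
# hypothetical):
#
#     LOG_LEVEL = 'INFO'
#     LOG_FILE = 'scrapy.log'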

SCHEDULER_DEBUG = False

LOGSTATS_INTERVAL = 60.0

MAIL_HOST = 'localhost'
MAIL_PORT = 25
MAIL_FROM = 'scrapy@localhost'
MAIL_PASS = None
MAIL_USER = None

MEMDEBUG_ENABLED = False  # enable memory debugging
MEMDEBUG_NOTIFY = []      # send memory debugging report by mail at engine shutdown

MEMUSAGE_CHECK_INTERVAL_SECONDS = 60.0
MEMUSAGE_ENABLED = True
MEMUSAGE_LIMIT_MB = 0
MEMUSAGE_NOTIFY_MAIL = []
MEMUSAGE_WARNING_MB = 0

METAREFRESH_ENABLED = True
METAREFRESH_IGNORE_TAGS = ['script', 'noscript']
METAREFRESH_MAXDELAY = 100

NEWSPIDER_MODULE = ''

RANDOMIZE_DOWNLOAD_DELAY = True

REACTOR_THREADPOOL_MAXSIZE = 10

REDIRECT_ENABLED = True
REDIRECT_MAX_TIMES = 20  # uses Firefox default setting
REDIRECT_PRIORITY_ADJUST = +2

REFERER_ENABLED = True
REFERRER_POLICY = 'scrapy.spidermiddlewares.referer.DefaultReferrerPolicy'

RETRY_ENABLED = True
RETRY_TIMES = 2  # initial response + 2 retries = 3 requests
RETRY_HTTP_CODES = [500, 502, 503, 504, 522, 524, 408, 429]
RETRY_PRIORITY_ADJUST = -1

ROBOTSTXT_OBEY = False
ROBOTSTXT_PARSER = 'scrapy.robotstxt.ProtegoRobotParser'
ROBOTSTXT_USER_AGENT = None
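
# Illustrative override: honour robots.txt, matching its rules against a
# dedicated token instead of the full USER_AGENT string (the token below is
# hypothetical):
#
#     ROBOTSTXT_OBEY = True
#     ROBOTSTXT_USER_AGENT = 'MyBot'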

SCHEDULER = 'scrapy.core.scheduler.Scheduler'
SCHEDULER_DISK_QUEUE = 'scrapy.squeues.PickleLifoDiskQueue'
SCHEDULER_MEMORY_QUEUE = 'scrapy.squeues.LifoMemoryQueue'
SCHEDULER_PRIORITY_QUEUE = 'scrapy.pqueues.ScrapyPriorityQueue'
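
# The default LIFO queues crawl roughly depth-first. For a breadth-first
# crawl, Scrapy's FAQ suggests switching to FIFO queues and prioritizing
# shallow requests:
#
#     DEPTH_PRIORITY = 1
#     SCHEDULER_DISK_QUEUE = 'scrapy.squeues.PickleFifoDiskQueue'
#     SCHEDULER_MEMORY_QUEUE = 'scrapy.squeues.FifoMemoryQueue'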

SPIDER_LOADER_CLASS = 'scrapy.spiderloader.SpiderLoader'
SPIDER_LOADER_WARN_ONLY = False

SPIDER_MIDDLEWARES = {}

SPIDER_MIDDLEWARES_BASE = {
    # Engine side
    'scrapy.spidermiddlewares.httperror.HttpErrorMiddleware': 50,
    'scrapy.spidermiddlewares.offsite.OffsiteMiddleware': 500,
    'scrapy.spidermiddlewares.referer.RefererMiddleware': 700,
    'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware': 800,
    'scrapy.spidermiddlewares.depth.DepthMiddleware': 900,
    # Spider side
}

SPIDER_MODULES = []

STATS_CLASS = 'scrapy.statscollectors.MemoryStatsCollector'
STATS_DUMP = True

STATSMAILER_RCPTS = []

TEMPLATES_DIR = abspath(join(dirname(__file__), '..', 'templates'))

URLLENGTH_LIMIT = 2083

USER_AGENT = 'Scrapy/%s (+https://scrapy.org)' % import_module('scrapy').__version__

TELNETCONSOLE_ENABLED = 1
TELNETCONSOLE_PORT = [6023, 6073]
TELNETCONSOLE_HOST = '127.0.0.1'
TELNETCONSOLE_USERNAME = 'scrapy'
TELNETCONSOLE_PASSWORD = None
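
# TELNETCONSOLE_PORT is a [min, max] range; the console binds the first free
# port in it. With TELNETCONSOLE_PASSWORD = None, a random password is
# generated and printed in the log at startup.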

SPIDER_CONTRACTS = {}
SPIDER_CONTRACTS_BASE = {
    'scrapy.contracts.default.UrlContract': 1,
    'scrapy.contracts.default.CallbackKeywordArgumentsContract': 1,
    'scrapy.contracts.default.ReturnsContract': 2,
    'scrapy.contracts.default.ScrapesContract': 3,
}