# protego.py

import logging
import re
from collections import namedtuple
from datetime import time

import six
from six.moves.urllib.parse import (ParseResult, quote, urlparse,
                                    urlunparse)

logger = logging.getLogger(__name__)

_Rule = namedtuple('Rule', ['field', 'value'])
RequestRate = namedtuple(
    'RequestRate', ['requests', 'seconds', 'start_time', 'end_time'])
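
# The directive sets below also accept common misspellings and spacing
# variants seen in real-world robots.txt files.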
_DISALLOW_DIRECTIVE = {'disallow', 'dissallow', 'dissalow', 'disalow', 'diasllow', 'disallaw'}
_ALLOW_DIRECTIVE = {'allow'}
_USER_AGENT_DIRECTIVE = {'user-agent', 'useragent', 'user agent'}
_SITEMAP_DIRECTIVE = {'sitemap', 'sitemaps', 'site-map'}
_CRAWL_DELAY_DIRECTIVE = {'crawl-delay', 'crawl delay'}
_REQUEST_RATE_DIRECTIVE = {'request-rate', 'request rate'}
_HOST_DIRECTIVE = {'host'}

_WILDCARDS = {'*', '$'}
_HEX_DIGITS = set('0123456789ABCDEFabcdef')

__all__ = ['RequestRate', 'Protego']


def _is_valid_directive_field(field):
    return any([field in _DISALLOW_DIRECTIVE,
                field in _ALLOW_DIRECTIVE,
                field in _USER_AGENT_DIRECTIVE,
                field in _SITEMAP_DIRECTIVE,
                field in _CRAWL_DELAY_DIRECTIVE,
                field in _REQUEST_RATE_DIRECTIVE,
                field in _HOST_DIRECTIVE])


def _enforce_path(pattern):
    if pattern.startswith('/'):
        return pattern

    return '/' + pattern


class _URLPattern(object):
    """Internal class which represents a URL pattern."""

    def __init__(self, pattern):
        self._pattern = pattern
        self.priority = len(pattern)
        self._contains_asterisk = '*' in self._pattern
        self._contains_dollar = self._pattern.endswith('$')

        if self._contains_asterisk:
            self._pattern_before_asterisk = self._pattern[:self._pattern.find('*')]
        elif self._contains_dollar:
            self._pattern_before_dollar = self._pattern[:-1]

        self._pattern_compiled = False

    def match(self, url):
        """Return True if pattern matches the given URL, otherwise return False."""
        # check if pattern is already compiled
        if self._pattern_compiled:
            return self._pattern.match(url)

        if not self._contains_asterisk:
            if not self._contains_dollar:
                # answer directly for patterns without wildcards
                return url.startswith(self._pattern)

            # pattern only contains $ wildcard.
            return url == self._pattern_before_dollar

        if not url.startswith(self._pattern_before_asterisk):
            return False

        self._pattern = self._prepare_pattern_for_regex(self._pattern)
        self._pattern = re.compile(self._pattern)
        self._pattern_compiled = True
        return self._pattern.match(url)

    def _prepare_pattern_for_regex(self, pattern):
        """Return equivalent regex pattern for the given URL pattern."""
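        # Illustrative example (assumption, not in the original): the URL
        # pattern '/foo*bar$' is rewritten to the regex '/foo.*?bar$'.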
        pattern = re.sub(r'\*+', '*', pattern)
        s = re.split(r'(\*|\$$)', pattern)
        for index, substr in enumerate(s):
            if substr not in _WILDCARDS:
                s[index] = re.escape(substr)
            elif s[index] == '*':
                s[index] = '.*?'

        pattern = ''.join(s)
        return pattern


class _RuleSet(object):
    """Internal class which stores rules for a user agent."""

    def __init__(self, parser_instance):
        self.user_agent = None
        self._rules = []
        self._crawl_delay = None
        self._req_rate = None
        self._parser_instance = parser_instance

    def applies_to(self, robotname):
        """Return matching score."""
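        # Illustrative example (assumption): with user_agent == 'examplebot',
        # applies_to('ExampleBot/2.1') returns 10 (length of the match),
        # applies_to('otherbot') returns 0, and a '*' rule set always returns 1.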
        robotname = robotname.strip().lower()
        if self.user_agent == '*':
            return 1

        if self.user_agent in robotname:
            return len(self.user_agent)

        return 0

    def _unquote(self, url, ignore='', errors='replace'):
        """Replace %xy escapes by their single-character equivalent."""
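        # Illustrative example (assumption): _unquote('/a%2fb') returns '/a/b',
        # while _unquote('/a%2fb', ignore='/') keeps the escape and only
        # normalizes its case, returning '/a%2Fb'.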
        if '%' not in url:
            return url

        def hex_to_byte(h):
            """Replaces a %xx escape with equivalent binary sequence."""
            if six.PY2:
                return chr(int(h, 16))
            return bytes.fromhex(h)

        # ignore contains %xy escapes for characters that are not
        # meant to be converted back.
        ignore = {'{:02X}'.format(ord(c)) for c in ignore}

        parts = url.split('%')
        parts[0] = parts[0].encode('utf-8')

        for i in range(1, len(parts)):
            if len(parts[i]) >= 2:
                # %xy is a valid escape only if x and y are hexadecimal digits.
                if set(parts[i][:2]).issubset(_HEX_DIGITS):
                    # make sure that all %xy escapes are in uppercase.
                    hexcode = parts[i][:2].upper()
                    leftover = parts[i][2:]
                    if hexcode not in ignore:
                        parts[i] = hex_to_byte(hexcode) + leftover.encode('utf-8')
                        continue
                    else:
                        parts[i] = hexcode + leftover

            # add back the '%' we removed during splitting.
            parts[i] = b'%' + parts[i].encode('utf-8')

        return b''.join(parts).decode('utf-8', errors)

    def hexescape(self, char):
        """Escape char as RFC 2396 specifies"""
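        # e.g. hexescape('$') returns '%24'.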
        hex_repr = hex(ord(char))[2:].upper()
        if len(hex_repr) == 1:
            hex_repr = "0%s" % hex_repr
        return "%" + hex_repr

    def _quote_path(self, path):
        """Return percent-encoded path."""
        parts = urlparse(path)
        path = self._unquote(parts.path, ignore='/%')
        # quote does not work with unicode strings in Python 2.7
        if six.PY2:
            path = quote(path.encode('utf-8'), safe='/%')
        else:
            path = quote(path, safe='/%')

        parts = ParseResult('', '', path, parts.params, parts.query, parts.fragment)
        path = urlunparse(parts)
        return path

    def _quote_pattern(self, pattern):
        # Corner case for query-only (e.g. '/abc?') and param-only (e.g. '/abc;') URLs.
        # Save the last character, otherwise urlparse will drop it.
        last_char = ''
        if pattern[-1] == '?' or pattern[-1] == ';' or pattern[-1] == '$':
            last_char = pattern[-1]
            pattern = pattern[:-1]

        parts = urlparse(pattern)
        pattern = self._unquote(parts.path, ignore='/*$%')
        # quote does not work with unicode strings in Python 2.7
        if six.PY2:
            pattern = quote(pattern.encode('utf-8'), safe='/*%')
        else:
            pattern = quote(pattern, safe='/*%')

        parts = ParseResult('', '', pattern + last_char, parts.params, parts.query, parts.fragment)
        pattern = urlunparse(parts)
        return pattern

    def allow(self, pattern):
        if '$' in pattern:
            self.allow(pattern.replace('$', self.hexescape('$')))

        pattern = self._quote_pattern(pattern)
        if not pattern:
            return
        self._rules.append(_Rule(field='allow', value=_URLPattern(pattern)))

        # If index.html is allowed, we interpret this as / being allowed too.
        if pattern.endswith('/index.html'):
            self.allow(pattern[:-10] + '$')

    def disallow(self, pattern):
        if '$' in pattern:
            self.disallow(pattern.replace('$', self.hexescape('$')))

        pattern = self._quote_pattern(pattern)
        if not pattern:
            return
        self._rules.append(_Rule(field='disallow', value=_URLPattern(pattern)))

    def finalize_rules(self):
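        # Longer (more specific) patterns are evaluated first; for patterns of
        # equal length, allow rules are checked before disallow rules.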
        self._rules.sort(key=lambda r: (r.value.priority, r.field == 'allow'), reverse=True)

    def can_fetch(self, url):
        """Return True if the URL can be fetched, otherwise return False."""
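        # Rules were sorted in finalize_rules(), so the first matching rule
        # decides the outcome; with no match, fetching is allowed by default.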
        url = self._quote_path(url)
        allowed = True
        for rule in self._rules:
            if rule.value.match(url):
                if rule.field == 'disallow':
                    allowed = False
                break

        return allowed

    @property
    def crawl_delay(self):
        """Get & set crawl delay for the rule set."""
        return self._crawl_delay

    @crawl_delay.setter
    def crawl_delay(self, delay):
        try:
            delay = float(delay)
        except ValueError:
            # Value is malformed, do nothing.
            logger.debug("Malformed rule at line {}: cannot set crawl delay to '{}'. "
                         "Ignoring this rule.".format(self._parser_instance._total_line_seen, delay))
            return

        self._crawl_delay = delay

    @property
    def request_rate(self):
        """Get & set request rate for the rule set."""
        return self._req_rate

    @request_rate.setter
    def request_rate(self, value):
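        # Illustrative example (assumption): the value '1/30m 0600-1845'
        # parses to RequestRate(requests=1, seconds=1800,
        # start_time=time(6, 0), end_time=time(18, 45)).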
        try:
            parts = value.split()
            if len(parts) == 2:
                rate, time_period = parts
            else:
                rate, time_period = parts[0], ''

            requests, seconds = rate.split('/')
            time_unit = seconds[-1].lower()
            requests, seconds = int(requests), int(seconds[:-1])

            if time_unit == 'm':
                seconds *= 60
            elif time_unit == 'h':
                seconds *= 3600
            elif time_unit == 'd':
                seconds *= 86400

            start_time = None
            end_time = None
            if time_period:
                start_time, end_time = time_period.split('-')
                start_time = time(int(start_time[:2]), int(start_time[-2:]))
                end_time = time(int(end_time[:2]), int(end_time[-2:]))
        except Exception:
            # Value is malformed, do nothing.
            logger.debug("Malformed rule at line {}: cannot set request rate using '{}'. "
                         "Ignoring this rule.".format(self._parser_instance._total_line_seen, value))
            return

        self._req_rate = RequestRate(requests, seconds, start_time, end_time)


class Protego(object):

    def __init__(self):
        # A dict mapping user agents (specified in robots.txt) to rule sets.
        self._user_agents = {}

        # Preferred host specified in the robots.txt
        self._host = None

        # A list of sitemaps specified in the robots.txt
        self._sitemap_list = []

        # A memoization table mapping user agents (used in queries) to matched rule sets.
        self._matched_rule_set = {}

        self._total_line_seen = 0
        self._invalid_directive_seen = 0
        self._total_directive_seen = 0

    @classmethod
    def parse(cls, content):
        o = cls()
        o._parse_robotstxt(content)
        return o

    def _parse_robotstxt(self, content):
        lines = content.splitlines()

        # A list containing rule sets corresponding to user
        # agents of the current record group.
        current_rule_sets = []

        # Last encountered rule, irrespective of whether it was valid or not.
        previous_rule_field = None

        for line in lines:
            self._total_line_seen += 1

            # Remove the comment portion of the line.
            hash_pos = line.find('#')
            if hash_pos != -1:
                line = line[0:hash_pos].strip()

            # Whitespace at the beginning and at the end of the line is ignored.
            line = line.strip()
            if not line:
                continue

            # Format for a valid robots.txt rule is "<field>:<value>".
            if line.find(':') != -1:
                field, value = line.split(':', 1)
            else:
                # We will be generous here and give it a second chance.
                parts = line.split(' ')
                if len(parts) < 2:
                    continue

                possible_field = parts[0]
                for i in range(1, len(parts)):
                    if _is_valid_directive_field(possible_field):
                        field, value = possible_field, ' '.join(parts[i:])
                        break
                    possible_field += ' ' + parts[i]
                else:
                    continue

            field = field.strip().lower()
            value = value.strip()

            # Ignore rules with no value part (e.g. "Disallow: ", "Allow: ").
            if not value:
                previous_rule_field = field
                continue

            # Ignore rules without a corresponding user agent.
            if not current_rule_sets and field not in _USER_AGENT_DIRECTIVE:
                logger.debug("Rule at line {} without any user agent to enforce it on.".format(self._total_line_seen))
                continue

            self._total_directive_seen += 1

            if field in _USER_AGENT_DIRECTIVE:
                if previous_rule_field and previous_rule_field not in _USER_AGENT_DIRECTIVE:
                    current_rule_sets = []

                # Wildcards are not supported in the user agent values.
                # We will be generous here and remove all the wildcards.
                user_agent = value.strip().lower()
                user_agent_without_asterisk = None
                if user_agent != '*' and '*' in user_agent:
                    user_agent_without_asterisk = user_agent.replace('*', '')

                for user_agent in [user_agent, user_agent_without_asterisk]:
                    if not user_agent:
                        continue
                    # See if this user agent was encountered before; if so, merge these rules into it.
                    rule_set = self._user_agents.get(user_agent, None)
                    if rule_set and rule_set not in current_rule_sets:
                        current_rule_sets.append(rule_set)
                    if not rule_set:
                        rule_set = _RuleSet(self)
                        rule_set.user_agent = user_agent
                        self._user_agents[user_agent] = rule_set
                        current_rule_sets.append(rule_set)
            elif field in _ALLOW_DIRECTIVE:
                for rule_set in current_rule_sets:
                    rule_set.allow(_enforce_path(value))
            elif field in _DISALLOW_DIRECTIVE:
                for rule_set in current_rule_sets:
                    rule_set.disallow(_enforce_path(value))
            elif field in _SITEMAP_DIRECTIVE:
                self._sitemap_list.append(value)
            elif field in _CRAWL_DELAY_DIRECTIVE:
                for rule_set in current_rule_sets:
                    rule_set.crawl_delay = value
            elif field in _REQUEST_RATE_DIRECTIVE:
                for rule_set in current_rule_sets:
                    rule_set.request_rate = value
            elif field in _HOST_DIRECTIVE:
                self._host = value
            else:
                self._invalid_directive_seen += 1

            previous_rule_field = field

        for user_agent in self._user_agents.values():
            user_agent.finalize_rules()

    def _get_matching_rule_set(self, user_agent):
        """Return the rule set with the highest matching score."""
        if not self._user_agents:
            return None

        if user_agent in self._matched_rule_set:
            return self._matched_rule_set[user_agent]

        score_rule_set_pairs = ((rs.applies_to(user_agent), rs) for rs in self._user_agents.values())
        match_score, matched_rule_set = max(score_rule_set_pairs, key=lambda p: p[0])

        if not match_score:
            self._matched_rule_set[user_agent] = None
            return None

        self._matched_rule_set[user_agent] = matched_rule_set
        return matched_rule_set

    def can_fetch(self, url, user_agent):
        """Return True if the user agent can fetch the URL, otherwise return False."""
        matched_rule_set = self._get_matching_rule_set(user_agent)
        if not matched_rule_set:
            return True

        return matched_rule_set.can_fetch(url)

    def crawl_delay(self, user_agent):
        """Return the crawl delay specified for the user agent as a float.
        If nothing is specified, return None.
        """
        matched_rule_set = self._get_matching_rule_set(user_agent)
        if not matched_rule_set:
            return None

        return matched_rule_set.crawl_delay

    def request_rate(self, user_agent):
        """Return the request rate specified for the user agent as a named tuple
        RequestRate(requests, seconds, start_time, end_time). If nothing is
        specified, return None.
        """
        matched_rule_set = self._get_matching_rule_set(user_agent)
        if not matched_rule_set:
            return None

        return matched_rule_set.request_rate

    @property
    def sitemaps(self):
        """Return an iterator over the sitemap links specified."""
        return iter(self._sitemap_list)

    @property
    def preferred_host(self):
        """Get the preferred host."""
        return self._host

    @property
    def _valid_directive_seen(self):
        return self._total_directive_seen - self._invalid_directive_seen
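

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative; not part of the original module). The
# robots.txt content and the 'examplebot' user agent below are hypothetical.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    robotstxt = (
        "User-agent: *\n"
        "Disallow: /private/\n"
        "Allow: /private/open/\n"
        "Crawl-delay: 2\n"
        "Sitemap: https://example.com/sitemap.xml\n"
    )
    rp = Protego.parse(robotstxt)
    print(rp.can_fetch('/private/secret.html', 'examplebot'))     # False
    print(rp.can_fetch('/private/open/page.html', 'examplebot'))  # True
    print(rp.crawl_delay('examplebot'))                           # 2.0
    print(list(rp.sitemaps))  # ['https://example.com/sitemap.xml']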