simple.py 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444
  1. from __future__ import absolute_import
  2. try:
  3. from itertools import zip_longest as izip_longest, repeat # pylint: disable=E0611
  4. except ImportError:
  5. from itertools import izip_longest as izip_longest, repeat # pylint: disable=E0611
  6. import logging
  7. import sys
  8. import time
  9. import warnings
  10. from kafka.vendor import six
  11. from kafka.vendor.six.moves import queue # pylint: disable=import-error
  12. from .base import (
  13. Consumer,
  14. FETCH_DEFAULT_BLOCK_TIMEOUT,
  15. AUTO_COMMIT_MSG_COUNT,
  16. AUTO_COMMIT_INTERVAL,
  17. FETCH_MIN_BYTES,
  18. FETCH_BUFFER_SIZE_BYTES,
  19. MAX_FETCH_BUFFER_SIZE_BYTES,
  20. FETCH_MAX_WAIT_TIME,
  21. ITER_TIMEOUT_SECONDS,
  22. NO_MESSAGES_WAIT_TIME_SECONDS
  23. )
  24. from ..common import (
  25. FetchRequestPayload, KafkaError, OffsetRequestPayload,
  26. ConsumerFetchSizeTooSmall,
  27. UnknownTopicOrPartitionError, NotLeaderForPartitionError,
  28. OffsetOutOfRangeError, FailedPayloadsError, check_error
  29. )
  30. from kafka.protocol.message import PartialMessage
# Module-level logger shared by FetchContext and SimpleConsumer below.
log = logging.getLogger(__name__)
  32. class FetchContext(object):
  33. """
  34. Class for managing the state of a consumer during fetch
  35. """
  36. def __init__(self, consumer, block, timeout):
  37. warnings.warn('deprecated - this class will be removed in a future'
  38. ' release', DeprecationWarning)
  39. self.consumer = consumer
  40. self.block = block
  41. if block:
  42. if not timeout:
  43. timeout = FETCH_DEFAULT_BLOCK_TIMEOUT
  44. self.timeout = timeout * 1000
  45. def __enter__(self):
  46. """Set fetch values based on blocking status"""
  47. self.orig_fetch_max_wait_time = self.consumer.fetch_max_wait_time
  48. self.orig_fetch_min_bytes = self.consumer.fetch_min_bytes
  49. if self.block:
  50. self.consumer.fetch_max_wait_time = self.timeout
  51. self.consumer.fetch_min_bytes = 1
  52. else:
  53. self.consumer.fetch_min_bytes = 0
  54. def __exit__(self, type, value, traceback):
  55. """Reset values"""
  56. self.consumer.fetch_max_wait_time = self.orig_fetch_max_wait_time
  57. self.consumer.fetch_min_bytes = self.orig_fetch_min_bytes
  58. class SimpleConsumer(Consumer):
  59. """
  60. A simple consumer implementation that consumes all/specified partitions
  61. for a topic
  62. Arguments:
  63. client: a connected SimpleClient
  64. group: a name for this consumer, used for offset storage and must be unique
  65. If you are connecting to a server that does not support offset
  66. commit/fetch (any prior to 0.8.1.1), then you *must* set this to None
  67. topic: the topic to consume
  68. Keyword Arguments:
  69. partitions: An optional list of partitions to consume the data from
  70. auto_commit: default True. Whether or not to auto commit the offsets
  71. auto_commit_every_n: default 100. How many messages to consume
  72. before a commit
  73. auto_commit_every_t: default 5000. How much time (in milliseconds) to
  74. wait before commit
  75. fetch_size_bytes: number of bytes to request in a FetchRequest
  76. buffer_size: default 4K. Initial number of bytes to tell kafka we
  77. have available. This will double as needed.
  78. max_buffer_size: default 16K. Max number of bytes to tell kafka we have
  79. available. None means no limit.
  80. iter_timeout: default None. How much time (in seconds) to wait for a
  81. message in the iterator before exiting. None means no
  82. timeout, so it will wait forever.
  83. auto_offset_reset: default largest. Reset partition offsets upon
  84. OffsetOutOfRangeError. Valid values are largest and smallest.
  85. Otherwise, do not reset the offsets and raise OffsetOutOfRangeError.
  86. Auto commit details:
  87. If both auto_commit_every_n and auto_commit_every_t are set, they will
  88. reset one another when one is triggered. These triggers simply call the
  89. commit method on this class. A manual call to commit will also reset
  90. these triggers
  91. """
  92. def __init__(self, client, group, topic, auto_commit=True, partitions=None,
  93. auto_commit_every_n=AUTO_COMMIT_MSG_COUNT,
  94. auto_commit_every_t=AUTO_COMMIT_INTERVAL,
  95. fetch_size_bytes=FETCH_MIN_BYTES,
  96. buffer_size=FETCH_BUFFER_SIZE_BYTES,
  97. max_buffer_size=MAX_FETCH_BUFFER_SIZE_BYTES,
  98. iter_timeout=None,
  99. auto_offset_reset='largest'):
  100. warnings.warn('deprecated - this class will be removed in a future'
  101. ' release. Use KafkaConsumer instead.',
  102. DeprecationWarning)
  103. super(SimpleConsumer, self).__init__(
  104. client, group, topic,
  105. partitions=partitions,
  106. auto_commit=auto_commit,
  107. auto_commit_every_n=auto_commit_every_n,
  108. auto_commit_every_t=auto_commit_every_t)
  109. if max_buffer_size is not None and buffer_size > max_buffer_size:
  110. raise ValueError('buffer_size (%d) is greater than '
  111. 'max_buffer_size (%d)' %
  112. (buffer_size, max_buffer_size))
  113. self.buffer_size = buffer_size
  114. self.max_buffer_size = max_buffer_size
  115. self.fetch_max_wait_time = FETCH_MAX_WAIT_TIME
  116. self.fetch_min_bytes = fetch_size_bytes
  117. self.fetch_offsets = self.offsets.copy()
  118. self.iter_timeout = iter_timeout
  119. self.auto_offset_reset = auto_offset_reset
  120. self.queue = queue.Queue()
  121. def __repr__(self):
  122. return '<SimpleConsumer group=%s, topic=%s, partitions=%s>' % \
  123. (self.group, self.topic, str(self.offsets.keys()))
  124. def reset_partition_offset(self, partition):
  125. """Update offsets using auto_offset_reset policy (smallest|largest)
  126. Arguments:
  127. partition (int): the partition for which offsets should be updated
  128. Returns: Updated offset on success, None on failure
  129. """
  130. LATEST = -1
  131. EARLIEST = -2
  132. if self.auto_offset_reset == 'largest':
  133. reqs = [OffsetRequestPayload(self.topic, partition, LATEST, 1)]
  134. elif self.auto_offset_reset == 'smallest':
  135. reqs = [OffsetRequestPayload(self.topic, partition, EARLIEST, 1)]
  136. else:
  137. # Let's raise an reasonable exception type if user calls
  138. # outside of an exception context
  139. if sys.exc_info() == (None, None, None):
  140. raise OffsetOutOfRangeError('Cannot reset partition offsets without a '
  141. 'valid auto_offset_reset setting '
  142. '(largest|smallest)')
  143. # Otherwise we should re-raise the upstream exception
  144. # b/c it typically includes additional data about
  145. # the request that triggered it, and we do not want to drop that
  146. raise # pylint: disable=E0704
  147. # send_offset_request
  148. log.info('Resetting topic-partition offset to %s for %s:%d',
  149. self.auto_offset_reset, self.topic, partition)
  150. try:
  151. (resp, ) = self.client.send_offset_request(reqs)
  152. except KafkaError as e:
  153. log.error('%s sending offset request for %s:%d',
  154. e.__class__.__name__, self.topic, partition)
  155. else:
  156. self.offsets[partition] = resp.offsets[0]
  157. self.fetch_offsets[partition] = resp.offsets[0]
  158. return resp.offsets[0]
  159. def seek(self, offset, whence=None, partition=None):
  160. """
  161. Alter the current offset in the consumer, similar to fseek
  162. Arguments:
  163. offset: how much to modify the offset
  164. whence: where to modify it from, default is None
  165. * None is an absolute offset
  166. * 0 is relative to the earliest available offset (head)
  167. * 1 is relative to the current offset
  168. * 2 is relative to the latest known offset (tail)
  169. partition: modify which partition, default is None.
  170. If partition is None, would modify all partitions.
  171. """
  172. if whence is None: # set an absolute offset
  173. if partition is None:
  174. for tmp_partition in self.offsets:
  175. self.offsets[tmp_partition] = offset
  176. else:
  177. self.offsets[partition] = offset
  178. elif whence == 1: # relative to current position
  179. if partition is None:
  180. for tmp_partition, _offset in self.offsets.items():
  181. self.offsets[tmp_partition] = _offset + offset
  182. else:
  183. self.offsets[partition] += offset
  184. elif whence in (0, 2): # relative to beginning or end
  185. reqs = []
  186. deltas = {}
  187. if partition is None:
  188. # divide the request offset by number of partitions,
  189. # distribute the remained evenly
  190. (delta, rem) = divmod(offset, len(self.offsets))
  191. for tmp_partition, r in izip_longest(self.offsets.keys(),
  192. repeat(1, rem),
  193. fillvalue=0):
  194. deltas[tmp_partition] = delta + r
  195. for tmp_partition in self.offsets.keys():
  196. if whence == 0:
  197. reqs.append(OffsetRequestPayload(self.topic, tmp_partition, -2, 1))
  198. elif whence == 2:
  199. reqs.append(OffsetRequestPayload(self.topic, tmp_partition, -1, 1))
  200. else:
  201. pass
  202. else:
  203. deltas[partition] = offset
  204. if whence == 0:
  205. reqs.append(OffsetRequestPayload(self.topic, partition, -2, 1))
  206. elif whence == 2:
  207. reqs.append(OffsetRequestPayload(self.topic, partition, -1, 1))
  208. else:
  209. pass
  210. resps = self.client.send_offset_request(reqs)
  211. for resp in resps:
  212. self.offsets[resp.partition] = \
  213. resp.offsets[0] + deltas[resp.partition]
  214. else:
  215. raise ValueError('Unexpected value for `whence`, %d' % whence)
  216. # Reset queue and fetch offsets since they are invalid
  217. self.fetch_offsets = self.offsets.copy()
  218. self.count_since_commit += 1
  219. if self.auto_commit:
  220. self.commit()
  221. self.queue = queue.Queue()
  222. def get_messages(self, count=1, block=True, timeout=0.1):
  223. """
  224. Fetch the specified number of messages
  225. Keyword Arguments:
  226. count: Indicates the maximum number of messages to be fetched
  227. block: If True, the API will block till all messages are fetched.
  228. If block is a positive integer the API will block until that
  229. many messages are fetched.
  230. timeout: When blocking is requested the function will block for
  231. the specified time (in seconds) until count messages is
  232. fetched. If None, it will block forever.
  233. """
  234. messages = []
  235. if timeout is not None:
  236. timeout += time.time()
  237. new_offsets = {}
  238. log.debug('getting %d messages', count)
  239. while len(messages) < count:
  240. block_time = timeout - time.time()
  241. log.debug('calling _get_message block=%s timeout=%s', block, block_time)
  242. block_next_call = block is True or block > len(messages)
  243. result = self._get_message(block_next_call, block_time,
  244. get_partition_info=True,
  245. update_offset=False)
  246. log.debug('got %s from _get_messages', result)
  247. if not result:
  248. if block_next_call and (timeout is None or time.time() <= timeout):
  249. continue
  250. break
  251. partition, message = result
  252. _msg = (partition, message) if self.partition_info else message
  253. messages.append(_msg)
  254. new_offsets[partition] = message.offset + 1
  255. # Update and commit offsets if necessary
  256. self.offsets.update(new_offsets)
  257. self.count_since_commit += len(messages)
  258. self._auto_commit()
  259. log.debug('got %d messages: %s', len(messages), messages)
  260. return messages
  261. def get_message(self, block=True, timeout=0.1, get_partition_info=None):
  262. return self._get_message(block, timeout, get_partition_info)
  263. def _get_message(self, block=True, timeout=0.1, get_partition_info=None,
  264. update_offset=True):
  265. """
  266. If no messages can be fetched, returns None.
  267. If get_partition_info is None, it defaults to self.partition_info
  268. If get_partition_info is True, returns (partition, message)
  269. If get_partition_info is False, returns message
  270. """
  271. start_at = time.time()
  272. while self.queue.empty():
  273. # We're out of messages, go grab some more.
  274. log.debug('internal queue empty, fetching more messages')
  275. with FetchContext(self, block, timeout):
  276. self._fetch()
  277. if not block or time.time() > (start_at + timeout):
  278. break
  279. try:
  280. partition, message = self.queue.get_nowait()
  281. if update_offset:
  282. # Update partition offset
  283. self.offsets[partition] = message.offset + 1
  284. # Count, check and commit messages if necessary
  285. self.count_since_commit += 1
  286. self._auto_commit()
  287. if get_partition_info is None:
  288. get_partition_info = self.partition_info
  289. if get_partition_info:
  290. return partition, message
  291. else:
  292. return message
  293. except queue.Empty:
  294. log.debug('internal queue empty after fetch - returning None')
  295. return None
  296. def __iter__(self):
  297. if self.iter_timeout is None:
  298. timeout = ITER_TIMEOUT_SECONDS
  299. else:
  300. timeout = self.iter_timeout
  301. while True:
  302. message = self.get_message(True, timeout)
  303. if message:
  304. yield message
  305. elif self.iter_timeout is None:
  306. # We did not receive any message yet but we don't have a
  307. # timeout, so give up the CPU for a while before trying again
  308. time.sleep(NO_MESSAGES_WAIT_TIME_SECONDS)
  309. else:
  310. # Timed out waiting for a message
  311. break
  312. def _fetch(self):
  313. # Create fetch request payloads for all the partitions
  314. partitions = dict((p, self.buffer_size)
  315. for p in self.fetch_offsets.keys())
  316. while partitions:
  317. requests = []
  318. for partition, buffer_size in six.iteritems(partitions):
  319. requests.append(FetchRequestPayload(self.topic, partition,
  320. self.fetch_offsets[partition],
  321. buffer_size))
  322. # Send request
  323. responses = self.client.send_fetch_request(
  324. requests,
  325. max_wait_time=int(self.fetch_max_wait_time),
  326. min_bytes=self.fetch_min_bytes,
  327. fail_on_error=False
  328. )
  329. retry_partitions = {}
  330. for resp in responses:
  331. try:
  332. check_error(resp)
  333. except UnknownTopicOrPartitionError:
  334. log.error('UnknownTopicOrPartitionError for %s:%d',
  335. resp.topic, resp.partition)
  336. self.client.reset_topic_metadata(resp.topic)
  337. raise
  338. except NotLeaderForPartitionError:
  339. log.error('NotLeaderForPartitionError for %s:%d',
  340. resp.topic, resp.partition)
  341. self.client.reset_topic_metadata(resp.topic)
  342. continue
  343. except OffsetOutOfRangeError:
  344. log.warning('OffsetOutOfRangeError for %s:%d. '
  345. 'Resetting partition offset...',
  346. resp.topic, resp.partition)
  347. self.reset_partition_offset(resp.partition)
  348. # Retry this partition
  349. retry_partitions[resp.partition] = partitions[resp.partition]
  350. continue
  351. except FailedPayloadsError as e:
  352. log.warning('FailedPayloadsError for %s:%d',
  353. e.payload.topic, e.payload.partition)
  354. # Retry this partition
  355. retry_partitions[e.payload.partition] = partitions[e.payload.partition]
  356. continue
  357. partition = resp.partition
  358. buffer_size = partitions[partition]
  359. # Check for partial message
  360. if resp.messages and isinstance(resp.messages[-1].message, PartialMessage):
  361. # If buffer is at max and all we got was a partial message
  362. # raise ConsumerFetchSizeTooSmall
  363. if (self.max_buffer_size is not None and
  364. buffer_size == self.max_buffer_size and
  365. len(resp.messages) == 1):
  366. log.error('Max fetch size %d too small', self.max_buffer_size)
  367. raise ConsumerFetchSizeTooSmall()
  368. if self.max_buffer_size is None:
  369. buffer_size *= 2
  370. else:
  371. buffer_size = min(buffer_size * 2, self.max_buffer_size)
  372. log.warning('Fetch size too small, increase to %d (2x) '
  373. 'and retry', buffer_size)
  374. retry_partitions[partition] = buffer_size
  375. resp.messages.pop()
  376. for message in resp.messages:
  377. if message.offset < self.fetch_offsets[partition]:
  378. log.debug('Skipping message %s because its offset is less than the consumer offset',
  379. message)
  380. continue
  381. # Put the message in our queue
  382. self.queue.put((partition, message))
  383. self.fetch_offsets[partition] = message.offset + 1
  384. partitions = retry_partitions