asynpool.py 42 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141
  1. # -*- coding: utf-8 -*-
  2. """
  3. celery.concurrency.asynpool
  4. ~~~~~~~~~~~~~~~~~~~~~~~~~~~
  5. .. note::
  6. This module will be moved soon, so don't use it directly.
  7. Non-blocking version of :class:`multiprocessing.Pool`.
  8. This code deals with three major challenges:
  9. 1) Starting up child processes and keeping them running.
  10. 2) Sending jobs to the processes and receiving results back.
  11. 3) Safely shutting down this system.
  12. """
  13. from __future__ import absolute_import
  14. import errno
  15. import os
  16. import random
  17. import select
  18. import socket
  19. import struct
  20. import time
  21. from collections import deque, namedtuple
  22. from io import BytesIO
  23. from pickle import HIGHEST_PROTOCOL
  24. from time import sleep
  25. from weakref import WeakValueDictionary, ref
  26. from amqp.utils import promise
  27. from billiard.pool import RUN, TERMINATE, ACK, NACK, WorkersJoined
  28. from billiard import pool as _pool
  29. from billiard.compat import setblocking, isblocking
  30. from billiard.einfo import ExceptionInfo
  31. from billiard.queues import _SimpleQueue
  32. from kombu.async import READ, WRITE, ERR
  33. from kombu.serialization import pickle as _pickle
  34. from kombu.utils import fxrange
  35. from kombu.utils.compat import get_errno
  36. from kombu.utils.eventio import SELECT_BAD_FD
  37. from celery.five import Counter, items, values
  38. from celery.utils.log import get_logger
  39. from celery.utils.text import truncate
  40. from celery.worker import state as worker_state
logger = get_logger(__name__)
# Module-level shortcuts to the logger's bound methods.
error, debug = logger.error, logger.debug

# errno values that the non-blocking read/write loops in this module treat
# as "not ready yet" (the coroutine yields and retries) instead of failing.
UNAVAIL = frozenset([errno.EAGAIN, errno.EINTR])

#: Constant sent by child process when started (ready to accept work)
WORKER_UP = 15

# Scheduling strategy identifiers; AsynPool compares its ``sched_strategy``
# attribute against SCHED_STRATEGY_FAIR when creating the write handlers.
SCHED_STRATEGY_PREFETCH = 1
SCHED_STRATEGY_FAIR = 4

# Maps the user-facing ``sched_strategy`` argument to the constants above
# (``None`` selects the prefetch strategy).
SCHED_STRATEGIES = {
    None: SCHED_STRATEGY_PREFETCH,
    'fair': SCHED_STRATEGY_FAIR,
}

# Record describing an ack write scheduled by ``send_ack``.
Ack = namedtuple('Ack', ('id', 'fd', 'payload'))
  53. def gen_not_started(gen):
  54. # gi_frame is None when generator stopped.
  55. return gen.gi_frame and gen.gi_frame.f_lasti == -1
  56. def _get_job_writer(job):
  57. try:
  58. writer = job._writer
  59. except AttributeError:
  60. pass
  61. else:
  62. return writer() # is a weakref
  63. def _select(readers=None, writers=None, err=None, timeout=0):
  64. """Simple wrapper to :class:`~select.select`.
  65. :param readers: Set of reader fds to test if readable.
  66. :param writers: Set of writer fds to test if writable.
  67. :param err: Set of fds to test for error condition.
  68. All fd sets passed must be mutable as this function
  69. will remove non-working fds from them, this also means
  70. the caller must make sure there are still fds in the sets
  71. before calling us again.
  72. :returns: tuple of ``(readable, writable, again)``, where
  73. ``readable`` is a set of fds that have data available for read,
  74. ``writable`` is a set of fds that is ready to be written to
  75. and ``again`` is a flag that if set means the caller must
  76. throw away the result and call us again.
  77. """
  78. readers = set() if readers is None else readers
  79. writers = set() if writers is None else writers
  80. err = set() if err is None else err
  81. try:
  82. r, w, e = select.select(readers, writers, err, timeout)
  83. if e:
  84. r = list(set(r) | set(e))
  85. return r, w, 0
  86. except (select.error, socket.error) as exc:
  87. if get_errno(exc) == errno.EINTR:
  88. return [], [], 1
  89. elif get_errno(exc) in SELECT_BAD_FD:
  90. for fd in readers | writers | err:
  91. try:
  92. select.select([fd], [], [], 0)
  93. except (select.error, socket.error) as exc:
  94. if get_errno(exc) not in SELECT_BAD_FD:
  95. raise
  96. readers.discard(fd)
  97. writers.discard(fd)
  98. err.discard(fd)
  99. return [], [], 1
  100. else:
  101. raise
  102. class Worker(_pool.Worker):
  103. """Pool worker process."""
  104. dead = False
  105. def on_loop_start(self, pid):
  106. # our version sends a WORKER_UP message when the process is ready
  107. # to accept work, this will tell the parent that the inqueue fd
  108. # is writable.
  109. self.outq.put((WORKER_UP, (pid, )))
  110. def prepare_result(self, result):
  111. if not isinstance(result, ExceptionInfo):
  112. return truncate(repr(result), 46)
  113. class ResultHandler(_pool.ResultHandler):
  114. """Handles messages from the pool processes."""
  115. def __init__(self, *args, **kwargs):
  116. self.fileno_to_outq = kwargs.pop('fileno_to_outq')
  117. self.on_process_alive = kwargs.pop('on_process_alive')
  118. super(ResultHandler, self).__init__(*args, **kwargs)
  119. # add our custom message handler
  120. self.state_handlers[WORKER_UP] = self.on_process_alive
  121. def _recv_message(self, add_reader, fd, callback,
  122. read=os.read, unpack=struct.unpack,
  123. loads=_pickle.loads, BytesIO=BytesIO):
  124. buf = BytesIO()
  125. # header
  126. remaining = 4
  127. bsize = None
  128. assert not isblocking(fd)
  129. while remaining > 0:
  130. try:
  131. bsize = read(fd, remaining)
  132. except OSError as exc:
  133. if get_errno(exc) not in UNAVAIL:
  134. raise
  135. yield
  136. else:
  137. n = len(bsize)
  138. if n == 0:
  139. if remaining == 4:
  140. raise EOFError()
  141. else:
  142. raise OSError("Got end of file during message")
  143. remaining -= n
  144. remaining, = size, = unpack('>i', bsize)
  145. while remaining > 0:
  146. try:
  147. chunk = read(fd, remaining)
  148. except OSError as exc:
  149. if get_errno(exc) not in UNAVAIL:
  150. raise
  151. yield
  152. n = len(chunk)
  153. if n == 0:
  154. if remaining == size:
  155. raise EOFError()
  156. else:
  157. raise IOError('Got end of file during message')
  158. buf.write(chunk)
  159. remaining -= n
  160. add_reader(fd, self.handle_event, fd)
  161. message = loads(buf.getvalue())
  162. if message:
  163. callback(message)
  164. def _make_process_result(self, hub):
  165. """Coroutine that reads messages from the pool processes
  166. and calls the appropriate handler."""
  167. fileno_to_outq = self.fileno_to_outq
  168. on_state_change = self.on_state_change
  169. add_reader = hub.add_reader
  170. hub_remove = hub.remove
  171. recv_message = self._recv_message
  172. def on_result_readable(fileno):
  173. try:
  174. fileno_to_outq[fileno]
  175. except KeyError: # process gone
  176. return hub_remove(fileno)
  177. it = recv_message(add_reader, fileno, on_state_change)
  178. try:
  179. next(it)
  180. except StopIteration:
  181. pass
  182. except (IOError, OSError, EOFError):
  183. hub_remove(fileno)
  184. else:
  185. add_reader(fileno, it)
  186. return on_result_readable
  187. def register_with_event_loop(self, hub):
  188. self.handle_event = self._make_process_result(hub)
  189. def handle_event(self, fileno):
  190. raise RuntimeError('Not registered with event loop')
  191. def on_stop_not_started(self):
  192. """This method is always used to stop when the helper thread is not
  193. started."""
  194. cache = self.cache
  195. check_timeouts = self.check_timeouts
  196. fileno_to_outq = self.fileno_to_outq
  197. on_state_change = self.on_state_change
  198. join_exited_workers = self.join_exited_workers
  199. # flush the processes outqueues until they have all terminated.
  200. outqueues = set(fileno_to_outq)
  201. while cache and outqueues and self._state != TERMINATE:
  202. if check_timeouts is not None:
  203. # make sure tasks with a time limit will time out.
  204. check_timeouts()
  205. for fd in outqueues:
  206. try:
  207. proc = fileno_to_outq[fd]
  208. except KeyError:
  209. # process already found terminated
  210. # which means its outqueue has already been processed
  211. # by the worker lost handler.
  212. outqueues.discard(fd)
  213. continue
  214. reader = proc.outq._reader
  215. try:
  216. setblocking(reader, 1)
  217. except (OSError, IOError):
  218. outqueues.discard(fd)
  219. continue
  220. try:
  221. if reader.poll(0):
  222. task = reader.recv()
  223. else:
  224. task = None
  225. sleep(0.5)
  226. except (IOError, EOFError):
  227. outqueues.discard(fd)
  228. continue
  229. else:
  230. if task:
  231. on_state_change(task)
  232. finally:
  233. try:
  234. setblocking(reader, 0)
  235. except (OSError, IOError):
  236. outqueues.discard(fd)
  237. try:
  238. join_exited_workers(shutdown=True)
  239. except WorkersJoined:
  240. debug('result handler: all workers terminated')
  241. return
  242. class AsynPool(_pool.Pool):
  243. """Pool version that uses AIO instead of helper threads."""
  244. ResultHandler = ResultHandler
  245. Worker = Worker
  246. def __init__(self, processes=None, synack=False,
  247. sched_strategy=None, *args, **kwargs):
  248. self.sched_strategy = SCHED_STRATEGIES.get(sched_strategy,
  249. sched_strategy)
  250. processes = self.cpu_count() if processes is None else processes
  251. self.synack = synack
  252. # create queue-pairs for all our processes in advance.
  253. self._queues = dict((self.create_process_queues(), None)
  254. for _ in range(processes))
  255. # inqueue fileno -> process mapping
  256. self._fileno_to_inq = {}
  257. # outqueue fileno -> process mapping
  258. self._fileno_to_outq = {}
  259. # synqueue fileno -> process mapping
  260. self._fileno_to_synq = {}
  261. # We keep track of processes that have not yet
  262. # sent a WORKER_UP message. If a process fails to send
  263. # this message within proc_up_timeout we terminate it
  264. # and hope the next process will recover.
  265. self._proc_alive_timeout = 2.0
  266. self._waiting_to_start = set()
  267. # denormalized set of all inqueues.
  268. self._all_inqueues = set()
  269. # Set of fds being written to (busy)
  270. self._active_writes = set()
  271. # Set of active co-routines currently writing jobs.
  272. self._active_writers = set()
  273. # Set of fds that are busy (executing task)
  274. self._busy_workers = set()
  275. self._mark_worker_as_available = self._busy_workers.discard
  276. # Holds jobs waiting to be written to child processes.
  277. self.outbound_buffer = deque()
  278. self.write_stats = Counter()
  279. super(AsynPool, self).__init__(processes, *args, **kwargs)
  280. for proc in self._pool:
  281. # create initial mappings, these will be updated
  282. # as processes are recycled, or found lost elsewhere.
  283. self._fileno_to_outq[proc.outqR_fd] = proc
  284. self._fileno_to_synq[proc.synqW_fd] = proc
  285. self.on_soft_timeout = self._timeout_handler.on_soft_timeout
  286. self.on_hard_timeout = self._timeout_handler.on_hard_timeout
  287. def _event_process_exit(self, hub, fd):
  288. # This method is called whenever the process sentinel is readable.
  289. hub.remove(fd)
  290. self.maintain_pool()
  291. def register_with_event_loop(self, hub):
  292. """Registers the async pool with the current event loop."""
  293. self._result_handler.register_with_event_loop(hub)
  294. self.handle_result_event = self._result_handler.handle_event
  295. self._create_timelimit_handlers(hub)
  296. self._create_process_handlers(hub)
  297. self._create_write_handlers(hub)
  298. # Add handler for when a process exits (calls maintain_pool)
  299. [hub.add_reader(fd, self._event_process_exit, hub, fd)
  300. for fd in self.process_sentinels]
  301. # Handle_result_event is called whenever one of the
  302. # result queues are readable.
  303. [hub.add_reader(fd, self.handle_result_event, fd)
  304. for fd in self._fileno_to_outq]
  305. # Timers include calling maintain_pool at a regular interval
  306. # to be certain processes are restarted.
  307. for handler, interval in items(self.timers):
  308. hub.call_repeatedly(interval, handler)
  309. hub.on_tick.add(self.on_poll_start)
  310. def _create_timelimit_handlers(self, hub, now=time.time):
  311. """For async pool this sets up the handlers used
  312. to implement time limits."""
  313. call_later = hub.call_later
  314. trefs = self._tref_for_id = WeakValueDictionary()
  315. def on_timeout_set(R, soft, hard):
  316. if soft:
  317. trefs[R._job] = call_later(
  318. soft, self._on_soft_timeout, R._job, soft, hard, hub,
  319. )
  320. elif hard:
  321. trefs[R._job] = call_later(
  322. hard, self._on_hard_timeout, R._job,
  323. )
  324. self.on_timeout_set = on_timeout_set
  325. def _discard_tref(job):
  326. try:
  327. tref = trefs.pop(job)
  328. tref.cancel()
  329. del(tref)
  330. except (KeyError, AttributeError):
  331. pass # out of scope
  332. self._discard_tref = _discard_tref
  333. def on_timeout_cancel(R):
  334. _discard_tref(R._job)
  335. self.on_timeout_cancel = on_timeout_cancel
  336. def _on_soft_timeout(self, job, soft, hard, hub, now=time.time):
  337. # only used by async pool.
  338. if hard:
  339. self._tref_for_id[job] = hub.call_at(
  340. now() + (hard - soft), self._on_hard_timeout, job,
  341. )
  342. try:
  343. result = self._cache[job]
  344. except KeyError:
  345. pass # job ready
  346. else:
  347. self.on_soft_timeout(result)
  348. finally:
  349. if not hard:
  350. # remove tref
  351. self._discard_tref(job)
  352. def _on_hard_timeout(self, job):
  353. # only used by async pool.
  354. try:
  355. result = self._cache[job]
  356. except KeyError:
  357. pass # job ready
  358. else:
  359. self.on_hard_timeout(result)
  360. finally:
  361. # remove tref
  362. self._discard_tref(job)
  363. def on_job_ready(self, job, i, obj, inqW_fd):
  364. self._mark_worker_as_available(inqW_fd)
  365. def _create_process_handlers(self, hub, READ=READ, ERR=ERR):
  366. """For async pool this will create the handlers called
  367. when a process is up/down and etc."""
  368. add_reader, hub_remove = hub.add_reader, hub.remove
  369. cache = self._cache
  370. all_inqueues = self._all_inqueues
  371. fileno_to_inq = self._fileno_to_inq
  372. fileno_to_outq = self._fileno_to_outq
  373. fileno_to_synq = self._fileno_to_synq
  374. busy_workers = self._busy_workers
  375. event_process_exit = self._event_process_exit
  376. handle_result_event = self.handle_result_event
  377. process_flush_queues = self.process_flush_queues
  378. waiting_to_start = self._waiting_to_start
  379. def verify_process_alive(proc):
  380. if proc.exitcode is None and proc in waiting_to_start:
  381. assert proc.outqR_fd in fileno_to_outq
  382. assert fileno_to_outq[proc.outqR_fd] is proc
  383. assert proc.outqR_fd in hub.readers
  384. error('Timed out waiting for UP message from %r', proc)
  385. os.kill(proc.pid, 9)
  386. def on_process_up(proc):
  387. """Called when a process has started."""
  388. # If we got the same fd as a previous process then we will also
  389. # receive jobs in the old buffer, so we need to reset the
  390. # job._write_to and job._scheduled_for attributes used to recover
  391. # message boundaries when processes exit.
  392. infd = proc.inqW_fd
  393. for job in values(cache):
  394. if job._write_to and job._write_to.inqW_fd == infd:
  395. job._write_to = proc
  396. if job._scheduled_for and job._scheduled_for.inqW_fd == infd:
  397. job._scheduled_for = proc
  398. fileno_to_outq[proc.outqR_fd] = proc
  399. # maintain_pool is called whenever a process exits.
  400. add_reader(
  401. proc.sentinel, event_process_exit, hub, proc.sentinel,
  402. )
  403. assert not isblocking(proc.outq._reader)
  404. # handle_result_event is called when the processes outqueue is
  405. # readable.
  406. add_reader(proc.outqR_fd, handle_result_event, proc.outqR_fd)
  407. waiting_to_start.add(proc)
  408. hub.call_later(
  409. self._proc_alive_timeout, verify_process_alive, proc,
  410. )
  411. self.on_process_up = on_process_up
  412. def _remove_from_index(obj, proc, index, callback=None):
  413. # this remove the file descriptors for a process from
  414. # the indices. we have to make sure we don't overwrite
  415. # another processes fds, as the fds may be reused.
  416. try:
  417. fd = obj.fileno()
  418. except (IOError, OSError):
  419. return
  420. try:
  421. if index[fd] is proc:
  422. # fd has not been reused so we can remove it from index.
  423. index.pop(fd, None)
  424. except KeyError:
  425. pass
  426. else:
  427. hub_remove(fd)
  428. if callback is not None:
  429. callback(fd)
  430. return fd
  431. def on_process_down(proc):
  432. """Called when a worker process exits."""
  433. if proc.dead:
  434. return
  435. process_flush_queues(proc)
  436. _remove_from_index(proc.outq._reader, proc, fileno_to_outq)
  437. if proc.synq:
  438. _remove_from_index(proc.synq._writer, proc, fileno_to_synq)
  439. inq = _remove_from_index(proc.inq._writer, proc, fileno_to_inq,
  440. callback=all_inqueues.discard)
  441. if inq:
  442. busy_workers.discard(inq)
  443. hub_remove(proc.sentinel)
  444. self.on_process_down = on_process_down
  445. def _create_write_handlers(self, hub,
  446. pack=struct.pack, dumps=_pickle.dumps,
  447. protocol=HIGHEST_PROTOCOL):
  448. """For async pool this creates the handlers used to write data to
  449. child processes."""
  450. fileno_to_inq = self._fileno_to_inq
  451. fileno_to_synq = self._fileno_to_synq
  452. outbound = self.outbound_buffer
  453. pop_message = outbound.popleft
  454. put_message = outbound.append
  455. all_inqueues = self._all_inqueues
  456. active_writes = self._active_writes
  457. active_writers = self._active_writers
  458. busy_workers = self._busy_workers
  459. diff = all_inqueues.difference
  460. add_reader, add_writer = hub.add_reader, hub.add_writer
  461. hub_add, hub_remove = hub.add, hub.remove
  462. mark_write_fd_as_active = active_writes.add
  463. mark_write_gen_as_active = active_writers.add
  464. mark_worker_as_busy = busy_workers.add
  465. write_generator_done = active_writers.discard
  466. get_job = self._cache.__getitem__
  467. write_stats = self.write_stats
  468. is_fair_strategy = self.sched_strategy == SCHED_STRATEGY_FAIR
  469. revoked_tasks = worker_state.revoked
  470. getpid = os.getpid
  471. precalc = {ACK: self._create_payload(ACK, (0, )),
  472. NACK: self._create_payload(NACK, (0, ))}
  473. def _put_back(job, _time=time.time):
  474. # puts back at the end of the queue
  475. if job._terminated is not None or \
  476. job.correlation_id in revoked_tasks:
  477. if not job._accepted:
  478. job._ack(None, _time(), getpid(), None)
  479. job._set_terminated(job._terminated)
  480. else:
  481. # XXX linear lookup, should find a better way,
  482. # but this happens rarely and is here to protect against races.
  483. if job not in outbound:
  484. outbound.appendleft(job)
  485. self._put_back = _put_back
  486. # called for every event loop iteration, and if there
  487. # are messages pending this will schedule writing one message
  488. # by registering the 'schedule_writes' function for all currently
  489. # inactive inqueues (not already being written to)
  490. # consolidate means the event loop will merge them
  491. # and call the callback once with the list writable fds as
  492. # argument. Using this means we minimize the risk of having
  493. # the same fd receive every task if the pipe read buffer is not
  494. # full.
  495. if is_fair_strategy:
  496. def on_poll_start():
  497. if outbound and len(busy_workers) < len(all_inqueues):
  498. #print('ALL: %r ACTIVE: %r' % (len(all_inqueues),
  499. # len(active_writes)))
  500. inactive = diff(active_writes)
  501. [hub_add(fd, None, WRITE | ERR, consolidate=True)
  502. for fd in inactive]
  503. else:
  504. [hub_remove(fd) for fd in diff(active_writes)]
  505. else:
  506. def on_poll_start(): # noqa
  507. if outbound:
  508. [hub_add(fd, None, WRITE | ERR, consolidate=True)
  509. for fd in diff(active_writes)]
  510. else:
  511. [hub_remove(fd) for fd in diff(active_writes)]
  512. self.on_poll_start = on_poll_start
  513. def on_inqueue_close(fd, proc):
  514. # Makes sure the fd is removed from tracking when
  515. # the connection is closed, this is essential as fds may be reused.
  516. busy_workers.discard(fd)
  517. try:
  518. if fileno_to_inq[fd] is proc:
  519. fileno_to_inq.pop(fd, None)
  520. active_writes.discard(fd)
  521. all_inqueues.discard(fd)
  522. hub_remove(fd)
  523. except KeyError:
  524. pass
  525. self.on_inqueue_close = on_inqueue_close
  526. def schedule_writes(ready_fds, shuffle=random.shuffle):
  527. # Schedule write operation to ready file descriptor.
  528. # The file descriptor is writeable, but that does not
  529. # mean the process is currently reading from the socket.
  530. # The socket is buffered so writeable simply means that
  531. # the buffer can accept at least 1 byte of data.
  532. shuffle(ready_fds)
  533. for ready_fd in ready_fds:
  534. if ready_fd in active_writes:
  535. # already writing to this fd
  536. continue
  537. if is_fair_strategy and ready_fd in busy_workers:
  538. # worker is already busy with another task
  539. continue
  540. if ready_fd not in all_inqueues:
  541. hub_remove(ready_fd)
  542. continue
  543. try:
  544. job = pop_message()
  545. except IndexError:
  546. # no more messages, remove all inactive fds from the hub.
  547. # this is important since the fds are always writeable
  548. # as long as there's 1 byte left in the buffer, and so
  549. # this may create a spinloop where the event loop
  550. # always wakes up.
  551. for inqfd in diff(active_writes):
  552. hub_remove(inqfd)
  553. break
  554. else:
  555. if not job._accepted: # job not accepted by another worker
  556. try:
  557. # keep track of what process the write operation
  558. # was scheduled for.
  559. proc = job._scheduled_for = fileno_to_inq[ready_fd]
  560. except KeyError:
  561. # write was scheduled for this fd but the process
  562. # has since exited and the message must be sent to
  563. # another process.
  564. put_message(job)
  565. continue
  566. cor = _write_job(proc, ready_fd, job)
  567. job._writer = ref(cor)
  568. mark_write_gen_as_active(cor)
  569. mark_write_fd_as_active(ready_fd)
  570. mark_worker_as_busy(ready_fd)
  571. # Try to write immediately, in case there's an error.
  572. try:
  573. next(cor)
  574. except StopIteration:
  575. pass
  576. except OSError as exc:
  577. if get_errno(exc) != errno.EBADF:
  578. raise
  579. else:
  580. add_writer(ready_fd, cor)
  581. hub.consolidate_callback = schedule_writes
  582. def send_job(tup):
  583. # Schedule writing job request for when one of the process
  584. # inqueues are writable.
  585. body = dumps(tup, protocol=protocol)
  586. body_size = len(body)
  587. header = pack('>I', body_size)
  588. # index 1,0 is the job ID.
  589. job = get_job(tup[1][0])
  590. job._payload = header, body, body_size
  591. put_message(job)
  592. self._quick_put = send_job
  593. def on_not_recovering(proc, fd, job):
  594. error('Process inqueue damaged: %r %r' % (proc, proc.exitcode))
  595. if proc.exitcode is not None:
  596. proc.terminate()
  597. hub.remove(fd)
  598. self._put_back(job)
  599. def _write_job(proc, fd, job):
  600. # writes job to the worker process.
  601. # Operation must complete if more than one byte of data
  602. # was written. If the broker connection is lost
  603. # and no data was written the operation shall be cancelled.
  604. header, body, body_size = job._payload
  605. errors = 0
  606. try:
  607. # job result keeps track of what process the job is sent to.
  608. job._write_to = proc
  609. send = proc.send_job_offset
  610. Hw = Bw = 0
  611. # write header
  612. while Hw < 4:
  613. try:
  614. Hw += send(header, Hw)
  615. except Exception as exc:
  616. if get_errno(exc) not in UNAVAIL:
  617. raise
  618. # suspend until more data
  619. errors += 1
  620. if errors > 100:
  621. on_not_recovering(proc, fd, job)
  622. raise StopIteration()
  623. yield
  624. else:
  625. errors = 0
  626. # write body
  627. while Bw < body_size:
  628. try:
  629. Bw += send(body, Bw)
  630. except Exception as exc:
  631. if get_errno(exc) not in UNAVAIL:
  632. raise
  633. # suspend until more data
  634. errors += 1
  635. if errors > 100:
  636. on_not_recovering(proc, fd, job)
  637. raise StopIteration()
  638. yield
  639. else:
  640. errors = 0
  641. finally:
  642. hub_remove(fd)
  643. write_stats[proc.index] += 1
  644. # message written, so this fd is now available
  645. active_writes.discard(fd)
  646. write_generator_done(job._writer()) # is a weakref
  647. def send_ack(response, pid, job, fd, WRITE=WRITE, ERR=ERR):
  648. # Only used when synack is enabled.
  649. # Schedule writing ack response for when the fd is writeable.
  650. msg = Ack(job, fd, precalc[response])
  651. callback = promise(write_generator_done)
  652. cor = _write_ack(fd, msg, callback=callback)
  653. mark_write_gen_as_active(cor)
  654. mark_write_fd_as_active(fd)
  655. callback.args = (cor, )
  656. add_writer(fd, cor)
  657. self.send_ack = send_ack
  658. def _write_ack(fd, ack, callback=None):
  659. # writes ack back to the worker if synack enabled.
  660. # this operation *MUST* complete, otherwise
  661. # the worker process will hang waiting for the ack.
  662. header, body, body_size = ack[2]
  663. try:
  664. try:
  665. proc = fileno_to_synq[fd]
  666. except KeyError:
  667. # process died, we can safely discard the ack at this
  668. # point.
  669. raise StopIteration()
  670. send = proc.send_syn_offset
  671. Hw = Bw = 0
  672. # write header
  673. while Hw < 4:
  674. try:
  675. Hw += send(header, Hw)
  676. except Exception as exc:
  677. if get_errno(exc) not in UNAVAIL:
  678. raise
  679. yield
  680. # write body
  681. while Bw < body_size:
  682. try:
  683. Bw += send(body, Bw)
  684. except Exception as exc:
  685. if get_errno(exc) not in UNAVAIL:
  686. raise
  687. # suspend until more data
  688. yield
  689. finally:
  690. if callback:
  691. callback()
  692. # message written, so this fd is now available
  693. active_writes.discard(fd)
    def flush(self):
        """Cancel unaccepted jobs and finish the writes already in flight.

        Does nothing when the pool state is ``TERMINATE``.  Otherwise every
        job in the cache that has not been accepted is cancelled, the
        outbound buffer is cleared and, while the pool state is ``RUN``, the
        active write coroutines are driven until none are left.  The
        outbound buffer and the write/busy bookkeeping sets are always
        cleared on the way out.
        """
        if self._state == TERMINATE:
            return
        # cancel all tasks that have not been accepted so that NACK is sent.
        for job in values(self._cache):
            if not job._accepted:
                job._cancel()

        # clear the outgoing buffer as the tasks will be redelivered by
        # the broker anyway.
        if self.outbound_buffer:
            self.outbound_buffer.clear()

        self.maintain_pool()

        try:
            # ...but we must continue writing the payloads we already started
            # to keep message boundaries.
            # The messages may be NACK'ed later if synack is enabled.
            if self._state == RUN:
                # flush outgoing buffers
                intervals = fxrange(0.01, 0.1, 0.01, repeatlast=True)
                # map each live write coroutine to the job it belongs to.
                owned_by = {}
                for job in values(self._cache):
                    writer = _get_job_writer(job)
                    if writer is not None:
                        owned_by[writer] = job

                while self._active_writers:
                    # iterate over a copy, the set is modified below.
                    writers = list(self._active_writers)
                    for gen in writers:
                        if (gen.__name__ == '_write_job' and
                                gen_not_started(gen)):
                            # has not started writing the job so can
                            # discard the task, but we must also remove
                            # it from the Pool._cache.
                            try:
                                job = owned_by[gen]
                            except KeyError:
                                pass
                            else:
                                # removes from Pool._cache
                                job.discard()
                            self._active_writers.discard(gen)
                        else:
                            # write already started: it has to be completed
                            # to keep the message boundaries intact.
                            try:
                                job = owned_by[gen]
                            except KeyError:
                                # NOTE(review): a coroutine without an owning
                                # job is not removed from _active_writers
                                # here -- confirm it is always removed
                                # elsewhere, or this loop cannot finish.
                                pass
                            else:
                                job_proc = job._write_to
                                if job_proc.exitcode is None:
                                    self._flush_writer(job_proc, gen)
                    # workers may have exited in the meantime.
                    self.maintain_pool()
                    sleep(next(intervals))  # don't busyloop
        finally:
            self.outbound_buffer.clear()
            self._active_writers.clear()
            self._active_writes.clear()
            self._busy_workers.clear()
  751. def _flush_writer(self, proc, writer):
  752. fds = set([proc.inq._writer])
  753. try:
  754. while fds:
  755. if proc.exitcode:
  756. break # process exited
  757. readable, writable, again = _select(
  758. writers=fds, err=fds, timeout=0.5,
  759. )
  760. if not again and (writable or readable):
  761. try:
  762. next(writer)
  763. except (StopIteration, OSError, IOError, EOFError):
  764. break
  765. finally:
  766. self._active_writers.discard(writer)
  def get_process_queues(self):
      """Return the queues to hand to a newly started process.

      Picks the first entry of ``self._queues`` whose owner is ``None``;
      a free slot is expected to exist whenever a new process is started.
      """
      unowned = (
          queues for queues, owner in items(self._queues)
          if owner is None
      )
      return next(unowned)
  def on_grow(self, n):
      """Create queues for process slots that do not have any yet.

      ``n`` is part of the handler signature but is not used here: the
      number of queue sets created is the gap between
      ``self._processes`` and the current size of ``self._queues``.
      New entries start out without an owner (``None``).
      """
      missing = self._processes - len(self._queues)
      if missing > 0:
          fresh = dict(
              (self.create_process_queues(), None) for _ in range(missing)
          )
          self._queues.update(fresh)
  def on_shrink(self, n):
      """Hook for the pool shrinking by ``n`` processes; does nothing."""
      return None
  def create_process_queues(self):
      """Build a fresh set of queues for one process.

      Returns an ``(inq, outq, synq)`` tuple; ``synq`` is ``None``
      unless ``self.synack`` is enabled.
      """
      # NOTE: O_NONBLOCK has to be set on the original fd when the pipe
      # is created; afterwards the flags cannot be changed until there
      # is an actual reader/writer on the other side.
      inbound = _SimpleQueue(wnonblock=True)
      outbound = _SimpleQueue(rnonblock=True)
      # only the write end of the inqueue and the read end of the
      # outqueue are non-blocking.
      assert isblocking(inbound._reader)
      assert not isblocking(inbound._writer)
      assert not isblocking(outbound._reader)
      assert isblocking(outbound._writer)
      syn = None
      if self.synack:
          syn = _SimpleQueue(wnonblock=True)
          assert isblocking(syn._reader)
          assert not isblocking(syn._writer)
      return inbound, outbound, syn
  def on_process_alive(self, pid):
      """Register the worker with process id ``pid`` as ready for work.

      Called when the WORKER_UP message arrives from a child process.
      The worker is dropped from ``_waiting_to_start`` and its inqueue
      and synqueue write fds are indexed.  Nothing happens when no
      process in ``self._pool`` has that pid.
      """
      for worker in self._pool:
          if worker.pid == pid:
              break
      else:
          # process already exited :( this will be handled elsewhere.
          return
      inq_fd = worker.inqW_fd
      assert inq_fd not in self._fileno_to_inq
      assert inq_fd not in self._all_inqueues
      self._waiting_to_start.discard(worker)
      self._fileno_to_inq[inq_fd] = worker
      self._fileno_to_synq[worker.synqW_fd] = worker
      self._all_inqueues.add(inq_fd)
  def on_job_process_down(self, job, pid_gone):
      """Handler called for each job when the process it was assigned to
      exits.

      :param job: job that was written to, or scheduled for, the process.
      :param pid_gone: pid of the exited process (not used here).

      A process counts as exited as soon as its ``exitcode`` is no
      longer ``None``.  The exit code is compared against ``None``
      rather than tested for truthiness, so that a worker that exited
      cleanly (code 0) is handled like any other exited worker and its
      job is not silently left behind.
      """
      if job._write_to and job._write_to.exitcode is not None:
          # job was partially written
          self.on_partial_read(job, job._write_to)
      elif (job._scheduled_for and
              job._scheduled_for.exitcode is not None):
          # job was only scheduled to be written to this process,
          # but no data was sent so put it back on the outbound_buffer.
          self._put_back(job)
  def on_job_process_lost(self, job, pid, exitcode):
      """Mark a *started* job as lost after its process went away.

      Called for each started job whose process exited by mysterious
      means (error exit codes and signals).  ``pid`` is not used; the
      job and ``exitcode`` are passed on to ``mark_as_worker_lost``.
      """
      mark_lost = self.mark_as_worker_lost
      mark_lost(job, exitcode)
  def human_write_stats(self):
      """Summarise ``self.write_stats`` in human readable form.

      Returns the string ``'N/A'`` when ``write_stats`` is ``None``.
      Otherwise returns a dict with the total count, the average and
      the individual counts as percentages of the total, the raw
      counts, and the number of known and active inqueues.
      """
      stats = self.write_stats
      if stats is None:
          return 'N/A'
      counts = list(values(stats))
      grand_total = sum(counts)

      def as_percent(part, whole):
          # a zero part is reported as 0 without dividing.
          ratio = (float(part) / whole) * 100.0 if part else 0
          return '{0:.2f}%'.format(ratio)

      if grand_total:
          mean = grand_total / len(stats)
      else:
          mean = 0
      inqueues = {
          'total': len(self._all_inqueues),
          'active': len(self._active_writes),
      }
      return {
          'total': grand_total,
          'avg': as_percent(mean, grand_total),
          'all': ', '.join(as_percent(c, grand_total) for c in counts),
          'raw': ', '.join(map(str, counts)),
          'inqueues': inqueues,
      }
  849. def _process_cleanup_queues(self, proc):
  850. """Handler called to clean up a processes queues after process
  851. exit."""
  852. if not proc.dead:
  853. try:
  854. self._queues[self._find_worker_queues(proc)] = None
  855. except (KeyError, ValueError):
  856. pass
  857. @staticmethod
  858. def _stop_task_handler(task_handler):
  859. """Called at shutdown to tell processes that we are shutting down."""
  860. for proc in task_handler.pool:
  861. proc.inq._writer.setblocking(1)
  862. try:
  863. proc.inq.put(None)
  864. except OSError as exc:
  865. if get_errno(exc) != errno.EBADF:
  866. raise
  def create_result_handler(self):
      """Create the result handler through the parent class.

      The parent implementation is given the fileno-to-outqueue map and
      ``on_process_alive`` as the callback of the same name.
      """
      parent = super(AsynPool, self)
      handler_options = dict(
          fileno_to_outq=self._fileno_to_outq,
          on_process_alive=self.on_process_alive,
      )
      return parent.create_result_handler(**handler_options)
  872. def _process_register_queues(self, proc, queues):
  873. """Marks new ownership for ``queues`` so that the fileno indices are
  874. updated."""
  875. assert queues in self._queues
  876. b = len(self._queues)
  877. self._queues[queues] = proc
  878. assert b == len(self._queues)
  def _find_worker_queues(self, proc):
      """Return the queues whose owner equals ``proc``.

      Raises ``ValueError(proc)`` when no entry of ``self._queues`` is
      owned by ``proc``.
      """
      for queues, owner in items(self._queues):
          if owner == proc:
              return queues
      raise ValueError(proc)
  886. def _setup_queues(self):
  887. # this is only used by the original pool which uses a shared
  888. # queue for all processes.
  889. # these attributes makes no sense for us, but we will still
  890. # have to initialize them.
  891. self._inqueue = self._outqueue = \
  892. self._quick_put = self._quick_get = self._poll_result = None
  def process_flush_queues(self, proc):
      """Drain pending results from the outqueue of ``proc``.

      Every message read from the read end of ``proc.outq`` is passed
      to the result handler's ``on_state_change``.  Reading stops when
      nothing is readable within the poll timeout, when the ``None``
      sentinel is received, when reading fails, when the reader is
      closed, or when the pool state is ``TERMINATE``.

      In Celery this is called whenever the transport connection is
      lost (consumer restart).
      """
      reader = proc.outq._reader
      handle_state = self._result_handler.on_state_change
      watched = set([reader])
      while watched and not reader.closed and self._state != TERMINATE:
          readable, _, again = _select(watched, None, watched, timeout=0.01)
          if not readable:
              break
          try:
              task = reader.recv()
          except (OSError, IOError, EOFError) as exc:
              # errors listed in UNAVAIL are not logged.
              if get_errno(exc) not in UNAVAIL:
                  debug('got %r while flushing process %r',
                        exc, proc, exc_info=1)
              break
          if task is None:
              debug('got sentinel while flushing process %r', proc)
              break
          handle_state(task)
  def on_partial_read(self, job, proc):
      """Handle a job that was only partly written before ``proc`` exited.

      A job that was not accepted is put back for another worker, the
      job's writer is dropped from the active writers, and, unless
      ``proc`` is already flagged ``dead``, the process is flagged and
      its queues are destroyed and replaced by a fresh set.
      """
      # worker terminated by signal:
      # the sockets cannot be reused, because we don't know whether the
      # process wrote/read anything from them, and if it did the message
      # boundaries cannot be restored.
      if not job._accepted:
          # job was not acked, so find another worker to send it to.
          self._put_back(job)
      writer = _get_job_writer(job)
      if writer:
          self._active_writers.discard(writer)
          del writer

      if proc.dead:
          return
      proc.dead = True
      # Replace queues to avoid reuse
      slots_before = len(self._queues)
      try:
          stale = self._find_worker_queues(proc)
          if self.destroy_queues(stale, proc):
              self._queues[self.create_process_queues()] = None
      except ValueError:
          # Not in the queue map; nothing is replaced and no sockets
          # are closed here.
          pass
      assert len(self._queues) == slots_before
  def destroy_queues(self, queues, proc):
      """Tear down queues that can no longer be used, so that they can
      be replaced by new sockets.

      ``proc`` must have exited.  The entry is removed from
      ``self._queues``, ``on_inqueue_close`` is called for the write fd
      of the first queue, and every socket that is still open is closed.

      Returns 1 if ``queues`` was present in ``self._queues``, else 0.
      """
      assert proc.exitcode is not None
      self._waiting_to_start.discard(proc)
      try:
          self._queues.pop(queues)
      except KeyError:
          removed = 0
      else:
          removed = 1
      try:
          self.on_inqueue_close(queues[0]._writer.fileno(), proc)
      except IOError:
          pass
      for queue in queues:
          if not queue:
              continue
          for sock in (queue._reader, queue._writer):
              if sock.closed:
                  continue
              try:
                  sock.close()
              except (IOError, OSError):
                  pass
      return removed
  970. def _create_payload(self, type_, args,
  971. dumps=_pickle.dumps, pack=struct.pack,
  972. protocol=HIGHEST_PROTOCOL):
  973. body = dumps((type_, args), protocol=protocol)
  974. size = len(body)
  975. header = pack('>I', size)
  976. return header, body, size
  977. @classmethod
  978. def _set_result_sentinel(cls, _outqueue, _pool):
  979. # unused
  980. pass
  981. def _help_stuff_finish_args(self):
  982. # Pool._help_stuff_finished is a classmethod so we have to use this
  983. # trick to modify the arguments passed to it.
  984. return (self._pool, )
  @classmethod
  def _help_stuff_finish(cls, pool):
      """Read and discard whatever is pending on the workers' inqueues.

      The read ends of the inqueues of all processes in ``pool`` are
      polled, and one message is received from each readable one, until
      a poll reports nothing readable.  Processes whose reader fileno
      cannot be obtained (``IOError``) are skipped.
      """
      debug(
          'removing tasks from inqueue until task handler finished',
      )
      proc_by_fd = {}
      read_fds = set()
      for worker in pool:
          try:
              fileno = worker.inq._reader.fileno()
              read_fds.add(fileno)
              proc_by_fd[fileno] = worker
          except IOError:
              continue
      while read_fds:
          ready, _, retry = _select(read_fds, timeout=0.5)
          if retry:
              continue
          if not ready:
              break
          for fileno in ready:
              proc_by_fd[fileno].inq._reader.recv()
          sleep(0)
  1008. @property
  1009. def timers(self):
  1010. return {self.maintain_pool: 5.0}