fsa.py 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715
  1. from __future__ import print_function
  2. import itertools
  3. import operator
  4. import sys
  5. from bisect import bisect_left
  6. from collections import defaultdict
  7. from whoosh.compat import iteritems, next, text_type, unichr, xrange
  8. unull = unichr(0)
  9. # Marker constants
  10. class Marker(object):
  11. def __init__(self, name):
  12. self.name = name
  13. def __repr__(self):
  14. return "<%s>" % self.name
  15. EPSILON = Marker("EPSILON")
  16. ANY = Marker("ANY")
  17. # Base class
  18. class FSA(object):
  19. def __init__(self, initial):
  20. self.initial = initial
  21. self.transitions = {}
  22. self.final_states = set()
  23. def __len__(self):
  24. return len(self.all_states())
  25. def __eq__(self, other):
  26. if self.initial != other.initial:
  27. return False
  28. if self.final_states != other.final_states:
  29. return False
  30. st = self.transitions
  31. ot = other.transitions
  32. if list(st) != list(ot):
  33. return False
  34. for key in st:
  35. if st[key] != ot[key]:
  36. return False
  37. return True
  38. def all_states(self):
  39. stateset = set(self.transitions)
  40. for src, trans in iteritems(self.transitions):
  41. stateset.update(trans.values())
  42. return stateset
  43. def all_labels(self):
  44. labels = set()
  45. for src, trans in iteritems(self.transitions):
  46. labels.update(trans)
  47. return labels
  48. def get_labels(self, src):
  49. return iter(self.transitions.get(src, []))
  50. def generate_all(self, state=None, sofar=""):
  51. state = self.start() if state is None else state
  52. if self.is_final(state):
  53. yield sofar
  54. for label in sorted(self.get_labels(state)):
  55. newstate = self.next_state(state, label)
  56. for string in self.generate_all(newstate, sofar + label):
  57. yield string
  58. def start(self):
  59. return self.initial
  60. def next_state(self, state, label):
  61. raise NotImplementedError
  62. def is_final(self, state):
  63. raise NotImplementedError
  64. def add_transition(self, src, label, dest):
  65. raise NotImplementedError
  66. def add_final_state(self, state):
  67. raise NotImplementedError
  68. def to_dfa(self):
  69. raise NotImplementedError
  70. def accept(self, string, debug=False):
  71. state = self.start()
  72. for label in string:
  73. if debug:
  74. print(" ", state, "->", label, "->")
  75. state = self.next_state(state, label)
  76. if not state:
  77. break
  78. return self.is_final(state)
  79. def append(self, fsa):
  80. self.transitions.update(fsa.transitions)
  81. for state in self.final_states:
  82. self.add_transition(state, EPSILON, fsa.initial)
  83. self.final_states = fsa.final_states
  84. # Implementations
  85. class NFA(FSA):
  86. def __init__(self, initial):
  87. self.transitions = {}
  88. self.final_states = set()
  89. self.initial = initial
  90. def dump(self, stream=sys.stdout):
  91. starts = self.start()
  92. for src in self.transitions:
  93. beg = "@" if src in starts else " "
  94. print(beg, src, file=stream)
  95. xs = self.transitions[src]
  96. for label in xs:
  97. dests = xs[label]
  98. end = "||" if self.is_final(dests) else ""
  99. def start(self):
  100. return frozenset(self._expand(set([self.initial])))
  101. def add_transition(self, src, label, dest):
  102. self.transitions.setdefault(src, {}).setdefault(label, set()).add(dest)
  103. def add_final_state(self, state):
  104. self.final_states.add(state)
  105. def triples(self):
  106. for src, trans in iteritems(self.transitions):
  107. for label, dests in iteritems(trans):
  108. for dest in dests:
  109. yield src, label, dest
  110. def is_final(self, states):
  111. return bool(self.final_states.intersection(states))
  112. def _expand(self, states):
  113. transitions = self.transitions
  114. frontier = set(states)
  115. while frontier:
  116. state = frontier.pop()
  117. if state in transitions and EPSILON in transitions[state]:
  118. new_states = transitions[state][EPSILON].difference(states)
  119. frontier.update(new_states)
  120. states.update(new_states)
  121. return states
  122. def next_state(self, states, label):
  123. transitions = self.transitions
  124. dest_states = set()
  125. for state in states:
  126. if state in transitions:
  127. xs = transitions[state]
  128. if label in xs:
  129. dest_states.update(xs[label])
  130. if ANY in xs:
  131. dest_states.update(xs[ANY])
  132. return frozenset(self._expand(dest_states))
  133. def get_labels(self, states):
  134. transitions = self.transitions
  135. labels = set()
  136. for state in states:
  137. if state in transitions:
  138. labels.update(transitions[state])
  139. return labels
  140. def embed(self, other):
  141. # Copy all transitions from the other NFA into this one
  142. for s, othertrans in iteritems(other.transitions):
  143. trans = self.transitions.setdefault(s, {})
  144. for label, otherdests in iteritems(othertrans):
  145. dests = trans.setdefault(label, set())
  146. dests.update(otherdests)
  147. def insert(self, src, other, dest):
  148. self.embed(other)
  149. # Connect src to the other NFA's initial state, and the other
  150. # NFA's final states to dest
  151. self.add_transition(src, EPSILON, other.initial)
  152. for finalstate in other.final_states:
  153. self.add_transition(finalstate, EPSILON, dest)
  154. def to_dfa(self):
  155. dfa = DFA(self.start())
  156. frontier = [self.start()]
  157. seen = set()
  158. while frontier:
  159. current = frontier.pop()
  160. if self.is_final(current):
  161. dfa.add_final_state(current)
  162. labels = self.get_labels(current)
  163. for label in labels:
  164. if label is EPSILON:
  165. continue
  166. new_state = self.next_state(current, label)
  167. if new_state not in seen:
  168. frontier.append(new_state)
  169. seen.add(new_state)
  170. if self.is_final(new_state):
  171. dfa.add_final_state(new_state)
  172. if label is ANY:
  173. dfa.set_default_transition(current, new_state)
  174. else:
  175. dfa.add_transition(current, label, new_state)
  176. return dfa
  177. class DFA(FSA):
  178. def __init__(self, initial):
  179. self.initial = initial
  180. self.transitions = {}
  181. self.defaults = {}
  182. self.final_states = set()
  183. self.outlabels = {}
  184. def dump(self, stream=sys.stdout):
  185. for src in sorted(self.transitions):
  186. beg = "@" if src == self.initial else " "
  187. print(beg, src, file=stream)
  188. xs = self.transitions[src]
  189. for label in sorted(xs):
  190. dest = xs[label]
  191. end = "||" if self.is_final(dest) else ""
  192. def start(self):
  193. return self.initial
  194. def add_transition(self, src, label, dest):
  195. self.transitions.setdefault(src, {})[label] = dest
  196. def set_default_transition(self, src, dest):
  197. self.defaults[src] = dest
  198. def add_final_state(self, state):
  199. self.final_states.add(state)
  200. def is_final(self, state):
  201. return state in self.final_states
  202. def next_state(self, src, label):
  203. trans = self.transitions.get(src, {})
  204. return trans.get(label, self.defaults.get(src, None))
  205. def next_valid_string(self, string, asbytes=False):
  206. state = self.start()
  207. stack = []
  208. # Follow the DFA as far as possible
  209. i = 0
  210. for i, label in enumerate(string):
  211. stack.append((string[:i], state, label))
  212. state = self.next_state(state, label)
  213. if not state:
  214. break
  215. else:
  216. stack.append((string[:i + 1], state, None))
  217. if self.is_final(state):
  218. # Word is already valid
  219. return string
  220. # Perform a 'wall following' search for the lexicographically smallest
  221. # accepting state.
  222. while stack:
  223. path, state, label = stack.pop()
  224. label = self.find_next_edge(state, label, asbytes=asbytes)
  225. if label:
  226. path += label
  227. state = self.next_state(state, label)
  228. if self.is_final(state):
  229. return path
  230. stack.append((path, state, None))
  231. return None
  232. def find_next_edge(self, s, label, asbytes):
  233. if label is None:
  234. label = b"\x00" if asbytes else u'\0'
  235. else:
  236. label = (label + 1) if asbytes else unichr(ord(label) + 1)
  237. trans = self.transitions.get(s, {})
  238. if label in trans or s in self.defaults:
  239. return label
  240. try:
  241. labels = self.outlabels[s]
  242. except KeyError:
  243. self.outlabels[s] = labels = sorted(trans)
  244. pos = bisect_left(labels, label)
  245. if pos < len(labels):
  246. return labels[pos]
  247. return None
  248. def reachable_from(self, src, inclusive=True):
  249. transitions = self.transitions
  250. reached = set()
  251. if inclusive:
  252. reached.add(src)
  253. stack = [src]
  254. seen = set()
  255. while stack:
  256. src = stack.pop()
  257. seen.add(src)
  258. for _, dest in iteritems(transitions[src]):
  259. reached.add(dest)
  260. if dest not in seen:
  261. stack.append(dest)
  262. return reached
  263. def minimize(self):
  264. transitions = self.transitions
  265. initial = self.initial
  266. # Step 1: Delete unreachable states
  267. reachable = self.reachable_from(initial)
  268. for src in list(transitions):
  269. if src not in reachable:
  270. del transitions[src]
  271. final_states = self.final_states.intersection(reachable)
  272. labels = self.all_labels()
  273. # Step 2: Partition the states into equivalence sets
  274. changed = True
  275. parts = [final_states, reachable - final_states]
  276. while changed:
  277. changed = False
  278. for i in xrange(len(parts)):
  279. part = parts[i]
  280. changed_part = False
  281. for label in labels:
  282. next_part = None
  283. new_part = set()
  284. for state in part:
  285. dest = transitions[state].get(label)
  286. if dest is not None:
  287. if next_part is None:
  288. for p in parts:
  289. if dest in p:
  290. next_part = p
  291. elif dest not in next_part:
  292. new_part.add(state)
  293. changed = True
  294. changed_part = True
  295. if changed_part:
  296. old_part = part - new_part
  297. parts.pop(i)
  298. parts.append(old_part)
  299. parts.append(new_part)
  300. break
  301. # Choose one state from each equivalence set and map all equivalent
  302. # states to it
  303. new_trans = {}
  304. # Create mapping
  305. mapping = {}
  306. new_initial = None
  307. for part in parts:
  308. representative = part.pop()
  309. if representative is initial:
  310. new_initial = representative
  311. mapping[representative] = representative
  312. new_trans[representative] = {}
  313. for state in part:
  314. if state is initial:
  315. new_initial = representative
  316. mapping[state] = representative
  317. assert new_initial is not None
  318. # Apply mapping to existing transitions
  319. new_finals = set(mapping[s] for s in final_states)
  320. for state, d in iteritems(new_trans):
  321. trans = transitions[state]
  322. for label, dest in iteritems(trans):
  323. d[label] = mapping[dest]
  324. # Remove dead states - non-final states with no outgoing arcs except
  325. # to themselves
  326. non_final_srcs = [src for src in new_trans if src not in new_finals]
  327. removing = set()
  328. for src in non_final_srcs:
  329. dests = set(new_trans[src].values())
  330. dests.discard(src)
  331. if not dests:
  332. removing.add(src)
  333. del new_trans[src]
  334. # Delete transitions to removed dead states
  335. for t in new_trans.values():
  336. for label in list(t):
  337. if t[label] in removing:
  338. del t[label]
  339. self.transitions = new_trans
  340. self.initial = new_initial
  341. self.final_states = new_finals
  342. def to_dfa(self):
  343. return self
  344. # Useful functions
  345. def renumber_dfa(dfa, base=0):
  346. c = itertools.count(base)
  347. mapping = {}
  348. def remap(state):
  349. if state in mapping:
  350. newnum = mapping[state]
  351. else:
  352. newnum = next(c)
  353. mapping[state] = newnum
  354. return newnum
  355. newdfa = DFA(remap(dfa.initial))
  356. for src, trans in iteritems(dfa.transitions):
  357. for label, dest in iteritems(trans):
  358. newdfa.add_transition(remap(src), label, remap(dest))
  359. for finalstate in dfa.final_states:
  360. newdfa.add_final_state(remap(finalstate))
  361. for src, dest in iteritems(dfa.defaults):
  362. newdfa.set_default_transition(remap(src), remap(dest))
  363. return newdfa
  364. def u_to_utf8(dfa, base=0):
  365. c = itertools.count(base)
  366. transitions = dfa.transitions
  367. for src, trans in iteritems(transitions):
  368. trans = transitions[src]
  369. for label, dest in list(iteritems(trans)):
  370. if label is EPSILON:
  371. continue
  372. elif label is ANY:
  373. raise Exception
  374. else:
  375. assert isinstance(label, text_type)
  376. label8 = label.encode("utf8")
  377. for i, byte in enumerate(label8):
  378. if i < len(label8) - 1:
  379. st = next(c)
  380. dfa.add_transition(src, byte, st)
  381. src = st
  382. else:
  383. dfa.add_transition(src, byte, dest)
  384. del trans[label]
  385. def find_all_matches(dfa, lookup_func, first=unull):
  386. """
  387. Uses lookup_func to find all words within levenshtein distance k of word.
  388. Args:
  389. word: The word to look up
  390. k: Maximum edit distance
  391. lookup_func: A single argument function that returns the first word in the
  392. database that is greater than or equal to the input argument.
  393. Yields:
  394. Every matching word within levenshtein distance k from the database.
  395. """
  396. match = dfa.next_valid_string(first)
  397. while match:
  398. key = lookup_func(match)
  399. if key is None:
  400. return
  401. if match == key:
  402. yield match
  403. key += unull
  404. match = dfa.next_valid_string(key)
  405. # Construction functions
  406. def reverse_nfa(n):
  407. s = object()
  408. nfa = NFA(s)
  409. for src, trans in iteritems(n.transitions):
  410. for label, destset in iteritems(trans):
  411. for dest in destset:
  412. nfa.add_transition(dest, label, src)
  413. for finalstate in n.final_states:
  414. nfa.add_transition(s, EPSILON, finalstate)
  415. nfa.add_final_state(n.initial)
  416. return nfa
  417. def product(dfa1, op, dfa2):
  418. dfa1 = dfa1.to_dfa()
  419. dfa2 = dfa2.to_dfa()
  420. start = (dfa1.start(), dfa2.start())
  421. dfa = DFA(start)
  422. stack = [start]
  423. while stack:
  424. src = stack.pop()
  425. state1, state2 = src
  426. trans1 = set(dfa1.transitions[state1])
  427. trans2 = set(dfa2.transitions[state2])
  428. for label in trans1.intersection(trans2):
  429. state1 = dfa1.next_state(state1, label)
  430. state2 = dfa2.next_state(state2, label)
  431. if op(state1 is not None, state2 is not None):
  432. dest = (state1, state2)
  433. dfa.add_transition(src, label, dest)
  434. stack.append(dest)
  435. if op(dfa1.is_final(state1), dfa2.is_final(state2)):
  436. dfa.add_final_state(dest)
  437. return dfa
  438. def intersection(dfa1, dfa2):
  439. return product(dfa1, operator.and_, dfa2)
  440. def union(dfa1, dfa2):
  441. return product(dfa1, operator.or_, dfa2)
  442. def epsilon_nfa():
  443. return basic_nfa(EPSILON)
  444. def dot_nfa():
  445. return basic_nfa(ANY)
  446. def basic_nfa(label):
  447. s = object()
  448. e = object()
  449. nfa = NFA(s)
  450. nfa.add_transition(s, label, e)
  451. nfa.add_final_state(e)
  452. return nfa
  453. def charset_nfa(labels):
  454. s = object()
  455. e = object()
  456. nfa = NFA(s)
  457. for label in labels:
  458. nfa.add_transition(s, label, e)
  459. nfa.add_final_state(e)
  460. return nfa
  461. def string_nfa(string):
  462. s = object()
  463. e = object()
  464. nfa = NFA(s)
  465. for label in string:
  466. e = object()
  467. nfa.add_transition(s, label, e)
  468. s = e
  469. nfa.add_final_state(e)
  470. return nfa
  471. def choice_nfa(n1, n2):
  472. s = object()
  473. e = object()
  474. nfa = NFA(s)
  475. # -> nfa1 -
  476. # / \
  477. # s e
  478. # \ /
  479. # -> nfa2 -
  480. nfa.insert(s, n1, e)
  481. nfa.insert(s, n2, e)
  482. nfa.add_final_state(e)
  483. return nfa
  484. def concat_nfa(n1, n2):
  485. s = object()
  486. m = object()
  487. e = object()
  488. nfa = NFA(s)
  489. nfa.insert(s, n1, m)
  490. nfa.insert(m, n2, e)
  491. nfa.add_final_state(e)
  492. return nfa
  493. def star_nfa(n):
  494. s = object()
  495. e = object()
  496. nfa = NFA(s)
  497. # -----<-----
  498. # / \
  499. # s ---> n ---> e
  500. # \ /
  501. # ----->-----
  502. nfa.insert(s, n, e)
  503. nfa.add_transition(s, EPSILON, e)
  504. for finalstate in n.final_states:
  505. nfa.add_transition(finalstate, EPSILON, s)
  506. nfa.add_final_state(e)
  507. return nfa
  508. def plus_nfa(n):
  509. return concat_nfa(n, star_nfa(n))
  510. def optional_nfa(n):
  511. return choice_nfa(n, epsilon_nfa())
  512. # Daciuk Mihov DFA construction algorithm
  513. class DMNode(object):
  514. def __init__(self, n):
  515. self.n = n
  516. self.arcs = {}
  517. self.final = False
  518. def __repr__(self):
  519. return "<%s, %r>" % (self.n, self.tuple())
  520. def __hash__(self):
  521. return hash(self.tuple())
  522. def tuple(self):
  523. arcs = tuple(sorted(iteritems(self.arcs)))
  524. return arcs, self.final
  525. def strings_dfa(strings):
  526. dfa = DFA(0)
  527. c = itertools.count(1)
  528. last = ""
  529. seen = {}
  530. nodes = [DMNode(0)]
  531. for string in strings:
  532. if string <= last:
  533. raise Exception("Strings must be in order")
  534. if not string:
  535. raise Exception("Can't add empty string")
  536. # Find the common prefix with the previous string
  537. i = 0
  538. while i < len(last) and i < len(string) and last[i] == string[i]:
  539. i += 1
  540. prefixlen = i
  541. # Freeze the transitions after the prefix, since they're not shared
  542. add_suffix(dfa, nodes, last, prefixlen + 1, seen)
  543. # Create new nodes for the substring after the prefix
  544. for label in string[prefixlen:]:
  545. node = DMNode(next(c))
  546. # Create an arc from the previous node to this node
  547. nodes[-1].arcs[label] = node.n
  548. nodes.append(node)
  549. # Mark the last node as an accept state
  550. nodes[-1].final = True
  551. last = string
  552. if len(nodes) > 1:
  553. add_suffix(dfa, nodes, last, 0, seen)
  554. return dfa
  555. def add_suffix(dfa, nodes, last, downto, seen):
  556. while len(nodes) > downto:
  557. node = nodes.pop()
  558. tup = node.tuple()
  559. # If a node just like this one (final/nonfinal, same arcs to same
  560. # destinations) is already seen, replace with it
  561. try:
  562. this = seen[tup]
  563. except KeyError:
  564. this = node.n
  565. if node.final:
  566. dfa.add_final_state(this)
  567. seen[tup] = this
  568. else:
  569. # If we replaced the node with an already seen one, fix the parent
  570. # node's pointer to this
  571. parent = nodes[-1]
  572. inlabel = last[len(nodes) - 1]
  573. parent.arcs[inlabel] = this
  574. # Add the node's transitions to the DFA
  575. for label, dest in iteritems(node.arcs):
  576. dfa.add_transition(this, label, dest)