123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715 |
- from __future__ import print_function
- import itertools
- import operator
- import sys
- from bisect import bisect_left
- from collections import defaultdict
- from whoosh.compat import iteritems, next, text_type, unichr, xrange
- unull = unichr(0)
- # Marker constants
- class Marker(object):
- def __init__(self, name):
- self.name = name
- def __repr__(self):
- return "<%s>" % self.name
- EPSILON = Marker("EPSILON")
- ANY = Marker("ANY")
- # Base class
- class FSA(object):
- def __init__(self, initial):
- self.initial = initial
- self.transitions = {}
- self.final_states = set()
- def __len__(self):
- return len(self.all_states())
- def __eq__(self, other):
- if self.initial != other.initial:
- return False
- if self.final_states != other.final_states:
- return False
- st = self.transitions
- ot = other.transitions
- if list(st) != list(ot):
- return False
- for key in st:
- if st[key] != ot[key]:
- return False
- return True
- def all_states(self):
- stateset = set(self.transitions)
- for src, trans in iteritems(self.transitions):
- stateset.update(trans.values())
- return stateset
- def all_labels(self):
- labels = set()
- for src, trans in iteritems(self.transitions):
- labels.update(trans)
- return labels
- def get_labels(self, src):
- return iter(self.transitions.get(src, []))
- def generate_all(self, state=None, sofar=""):
- state = self.start() if state is None else state
- if self.is_final(state):
- yield sofar
- for label in sorted(self.get_labels(state)):
- newstate = self.next_state(state, label)
- for string in self.generate_all(newstate, sofar + label):
- yield string
- def start(self):
- return self.initial
- def next_state(self, state, label):
- raise NotImplementedError
- def is_final(self, state):
- raise NotImplementedError
- def add_transition(self, src, label, dest):
- raise NotImplementedError
- def add_final_state(self, state):
- raise NotImplementedError
- def to_dfa(self):
- raise NotImplementedError
- def accept(self, string, debug=False):
- state = self.start()
- for label in string:
- if debug:
- print(" ", state, "->", label, "->")
- state = self.next_state(state, label)
- if not state:
- break
- return self.is_final(state)
- def append(self, fsa):
- self.transitions.update(fsa.transitions)
- for state in self.final_states:
- self.add_transition(state, EPSILON, fsa.initial)
- self.final_states = fsa.final_states
- # Implementations
- class NFA(FSA):
- def __init__(self, initial):
- self.transitions = {}
- self.final_states = set()
- self.initial = initial
- def dump(self, stream=sys.stdout):
- starts = self.start()
- for src in self.transitions:
- beg = "@" if src in starts else " "
- print(beg, src, file=stream)
- xs = self.transitions[src]
- for label in xs:
- dests = xs[label]
- end = "||" if self.is_final(dests) else ""
- def start(self):
- return frozenset(self._expand(set([self.initial])))
- def add_transition(self, src, label, dest):
- self.transitions.setdefault(src, {}).setdefault(label, set()).add(dest)
- def add_final_state(self, state):
- self.final_states.add(state)
- def triples(self):
- for src, trans in iteritems(self.transitions):
- for label, dests in iteritems(trans):
- for dest in dests:
- yield src, label, dest
- def is_final(self, states):
- return bool(self.final_states.intersection(states))
- def _expand(self, states):
- transitions = self.transitions
- frontier = set(states)
- while frontier:
- state = frontier.pop()
- if state in transitions and EPSILON in transitions[state]:
- new_states = transitions[state][EPSILON].difference(states)
- frontier.update(new_states)
- states.update(new_states)
- return states
- def next_state(self, states, label):
- transitions = self.transitions
- dest_states = set()
- for state in states:
- if state in transitions:
- xs = transitions[state]
- if label in xs:
- dest_states.update(xs[label])
- if ANY in xs:
- dest_states.update(xs[ANY])
- return frozenset(self._expand(dest_states))
- def get_labels(self, states):
- transitions = self.transitions
- labels = set()
- for state in states:
- if state in transitions:
- labels.update(transitions[state])
- return labels
- def embed(self, other):
- # Copy all transitions from the other NFA into this one
- for s, othertrans in iteritems(other.transitions):
- trans = self.transitions.setdefault(s, {})
- for label, otherdests in iteritems(othertrans):
- dests = trans.setdefault(label, set())
- dests.update(otherdests)
- def insert(self, src, other, dest):
- self.embed(other)
- # Connect src to the other NFA's initial state, and the other
- # NFA's final states to dest
- self.add_transition(src, EPSILON, other.initial)
- for finalstate in other.final_states:
- self.add_transition(finalstate, EPSILON, dest)
- def to_dfa(self):
- dfa = DFA(self.start())
- frontier = [self.start()]
- seen = set()
- while frontier:
- current = frontier.pop()
- if self.is_final(current):
- dfa.add_final_state(current)
- labels = self.get_labels(current)
- for label in labels:
- if label is EPSILON:
- continue
- new_state = self.next_state(current, label)
- if new_state not in seen:
- frontier.append(new_state)
- seen.add(new_state)
- if self.is_final(new_state):
- dfa.add_final_state(new_state)
- if label is ANY:
- dfa.set_default_transition(current, new_state)
- else:
- dfa.add_transition(current, label, new_state)
- return dfa
- class DFA(FSA):
- def __init__(self, initial):
- self.initial = initial
- self.transitions = {}
- self.defaults = {}
- self.final_states = set()
- self.outlabels = {}
- def dump(self, stream=sys.stdout):
- for src in sorted(self.transitions):
- beg = "@" if src == self.initial else " "
- print(beg, src, file=stream)
- xs = self.transitions[src]
- for label in sorted(xs):
- dest = xs[label]
- end = "||" if self.is_final(dest) else ""
- def start(self):
- return self.initial
- def add_transition(self, src, label, dest):
- self.transitions.setdefault(src, {})[label] = dest
- def set_default_transition(self, src, dest):
- self.defaults[src] = dest
- def add_final_state(self, state):
- self.final_states.add(state)
- def is_final(self, state):
- return state in self.final_states
- def next_state(self, src, label):
- trans = self.transitions.get(src, {})
- return trans.get(label, self.defaults.get(src, None))
- def next_valid_string(self, string, asbytes=False):
- state = self.start()
- stack = []
- # Follow the DFA as far as possible
- i = 0
- for i, label in enumerate(string):
- stack.append((string[:i], state, label))
- state = self.next_state(state, label)
- if not state:
- break
- else:
- stack.append((string[:i + 1], state, None))
- if self.is_final(state):
- # Word is already valid
- return string
- # Perform a 'wall following' search for the lexicographically smallest
- # accepting state.
- while stack:
- path, state, label = stack.pop()
- label = self.find_next_edge(state, label, asbytes=asbytes)
- if label:
- path += label
- state = self.next_state(state, label)
- if self.is_final(state):
- return path
- stack.append((path, state, None))
- return None
- def find_next_edge(self, s, label, asbytes):
- if label is None:
- label = b"\x00" if asbytes else u'\0'
- else:
- label = (label + 1) if asbytes else unichr(ord(label) + 1)
- trans = self.transitions.get(s, {})
- if label in trans or s in self.defaults:
- return label
- try:
- labels = self.outlabels[s]
- except KeyError:
- self.outlabels[s] = labels = sorted(trans)
- pos = bisect_left(labels, label)
- if pos < len(labels):
- return labels[pos]
- return None
- def reachable_from(self, src, inclusive=True):
- transitions = self.transitions
- reached = set()
- if inclusive:
- reached.add(src)
- stack = [src]
- seen = set()
- while stack:
- src = stack.pop()
- seen.add(src)
- for _, dest in iteritems(transitions[src]):
- reached.add(dest)
- if dest not in seen:
- stack.append(dest)
- return reached
- def minimize(self):
- transitions = self.transitions
- initial = self.initial
- # Step 1: Delete unreachable states
- reachable = self.reachable_from(initial)
- for src in list(transitions):
- if src not in reachable:
- del transitions[src]
- final_states = self.final_states.intersection(reachable)
- labels = self.all_labels()
- # Step 2: Partition the states into equivalence sets
- changed = True
- parts = [final_states, reachable - final_states]
- while changed:
- changed = False
- for i in xrange(len(parts)):
- part = parts[i]
- changed_part = False
- for label in labels:
- next_part = None
- new_part = set()
- for state in part:
- dest = transitions[state].get(label)
- if dest is not None:
- if next_part is None:
- for p in parts:
- if dest in p:
- next_part = p
- elif dest not in next_part:
- new_part.add(state)
- changed = True
- changed_part = True
- if changed_part:
- old_part = part - new_part
- parts.pop(i)
- parts.append(old_part)
- parts.append(new_part)
- break
- # Choose one state from each equivalence set and map all equivalent
- # states to it
- new_trans = {}
- # Create mapping
- mapping = {}
- new_initial = None
- for part in parts:
- representative = part.pop()
- if representative is initial:
- new_initial = representative
- mapping[representative] = representative
- new_trans[representative] = {}
- for state in part:
- if state is initial:
- new_initial = representative
- mapping[state] = representative
- assert new_initial is not None
- # Apply mapping to existing transitions
- new_finals = set(mapping[s] for s in final_states)
- for state, d in iteritems(new_trans):
- trans = transitions[state]
- for label, dest in iteritems(trans):
- d[label] = mapping[dest]
- # Remove dead states - non-final states with no outgoing arcs except
- # to themselves
- non_final_srcs = [src for src in new_trans if src not in new_finals]
- removing = set()
- for src in non_final_srcs:
- dests = set(new_trans[src].values())
- dests.discard(src)
- if not dests:
- removing.add(src)
- del new_trans[src]
- # Delete transitions to removed dead states
- for t in new_trans.values():
- for label in list(t):
- if t[label] in removing:
- del t[label]
- self.transitions = new_trans
- self.initial = new_initial
- self.final_states = new_finals
- def to_dfa(self):
- return self
- # Useful functions
- def renumber_dfa(dfa, base=0):
- c = itertools.count(base)
- mapping = {}
- def remap(state):
- if state in mapping:
- newnum = mapping[state]
- else:
- newnum = next(c)
- mapping[state] = newnum
- return newnum
- newdfa = DFA(remap(dfa.initial))
- for src, trans in iteritems(dfa.transitions):
- for label, dest in iteritems(trans):
- newdfa.add_transition(remap(src), label, remap(dest))
- for finalstate in dfa.final_states:
- newdfa.add_final_state(remap(finalstate))
- for src, dest in iteritems(dfa.defaults):
- newdfa.set_default_transition(remap(src), remap(dest))
- return newdfa
- def u_to_utf8(dfa, base=0):
- c = itertools.count(base)
- transitions = dfa.transitions
- for src, trans in iteritems(transitions):
- trans = transitions[src]
- for label, dest in list(iteritems(trans)):
- if label is EPSILON:
- continue
- elif label is ANY:
- raise Exception
- else:
- assert isinstance(label, text_type)
- label8 = label.encode("utf8")
- for i, byte in enumerate(label8):
- if i < len(label8) - 1:
- st = next(c)
- dfa.add_transition(src, byte, st)
- src = st
- else:
- dfa.add_transition(src, byte, dest)
- del trans[label]
- def find_all_matches(dfa, lookup_func, first=unull):
- """
- Uses lookup_func to find all words within levenshtein distance k of word.
- Args:
- word: The word to look up
- k: Maximum edit distance
- lookup_func: A single argument function that returns the first word in the
- database that is greater than or equal to the input argument.
- Yields:
- Every matching word within levenshtein distance k from the database.
- """
- match = dfa.next_valid_string(first)
- while match:
- key = lookup_func(match)
- if key is None:
- return
- if match == key:
- yield match
- key += unull
- match = dfa.next_valid_string(key)
- # Construction functions
- def reverse_nfa(n):
- s = object()
- nfa = NFA(s)
- for src, trans in iteritems(n.transitions):
- for label, destset in iteritems(trans):
- for dest in destset:
- nfa.add_transition(dest, label, src)
- for finalstate in n.final_states:
- nfa.add_transition(s, EPSILON, finalstate)
- nfa.add_final_state(n.initial)
- return nfa
- def product(dfa1, op, dfa2):
- dfa1 = dfa1.to_dfa()
- dfa2 = dfa2.to_dfa()
- start = (dfa1.start(), dfa2.start())
- dfa = DFA(start)
- stack = [start]
- while stack:
- src = stack.pop()
- state1, state2 = src
- trans1 = set(dfa1.transitions[state1])
- trans2 = set(dfa2.transitions[state2])
- for label in trans1.intersection(trans2):
- state1 = dfa1.next_state(state1, label)
- state2 = dfa2.next_state(state2, label)
- if op(state1 is not None, state2 is not None):
- dest = (state1, state2)
- dfa.add_transition(src, label, dest)
- stack.append(dest)
- if op(dfa1.is_final(state1), dfa2.is_final(state2)):
- dfa.add_final_state(dest)
- return dfa
- def intersection(dfa1, dfa2):
- return product(dfa1, operator.and_, dfa2)
- def union(dfa1, dfa2):
- return product(dfa1, operator.or_, dfa2)
- def epsilon_nfa():
- return basic_nfa(EPSILON)
- def dot_nfa():
- return basic_nfa(ANY)
- def basic_nfa(label):
- s = object()
- e = object()
- nfa = NFA(s)
- nfa.add_transition(s, label, e)
- nfa.add_final_state(e)
- return nfa
- def charset_nfa(labels):
- s = object()
- e = object()
- nfa = NFA(s)
- for label in labels:
- nfa.add_transition(s, label, e)
- nfa.add_final_state(e)
- return nfa
- def string_nfa(string):
- s = object()
- e = object()
- nfa = NFA(s)
- for label in string:
- e = object()
- nfa.add_transition(s, label, e)
- s = e
- nfa.add_final_state(e)
- return nfa
- def choice_nfa(n1, n2):
- s = object()
- e = object()
- nfa = NFA(s)
- # -> nfa1 -
- # / \
- # s e
- # \ /
- # -> nfa2 -
- nfa.insert(s, n1, e)
- nfa.insert(s, n2, e)
- nfa.add_final_state(e)
- return nfa
- def concat_nfa(n1, n2):
- s = object()
- m = object()
- e = object()
- nfa = NFA(s)
- nfa.insert(s, n1, m)
- nfa.insert(m, n2, e)
- nfa.add_final_state(e)
- return nfa
- def star_nfa(n):
- s = object()
- e = object()
- nfa = NFA(s)
- # -----<-----
- # / \
- # s ---> n ---> e
- # \ /
- # ----->-----
- nfa.insert(s, n, e)
- nfa.add_transition(s, EPSILON, e)
- for finalstate in n.final_states:
- nfa.add_transition(finalstate, EPSILON, s)
- nfa.add_final_state(e)
- return nfa
- def plus_nfa(n):
- return concat_nfa(n, star_nfa(n))
- def optional_nfa(n):
- return choice_nfa(n, epsilon_nfa())
- # Daciuk Mihov DFA construction algorithm
- class DMNode(object):
- def __init__(self, n):
- self.n = n
- self.arcs = {}
- self.final = False
- def __repr__(self):
- return "<%s, %r>" % (self.n, self.tuple())
- def __hash__(self):
- return hash(self.tuple())
- def tuple(self):
- arcs = tuple(sorted(iteritems(self.arcs)))
- return arcs, self.final
- def strings_dfa(strings):
- dfa = DFA(0)
- c = itertools.count(1)
- last = ""
- seen = {}
- nodes = [DMNode(0)]
- for string in strings:
- if string <= last:
- raise Exception("Strings must be in order")
- if not string:
- raise Exception("Can't add empty string")
- # Find the common prefix with the previous string
- i = 0
- while i < len(last) and i < len(string) and last[i] == string[i]:
- i += 1
- prefixlen = i
- # Freeze the transitions after the prefix, since they're not shared
- add_suffix(dfa, nodes, last, prefixlen + 1, seen)
- # Create new nodes for the substring after the prefix
- for label in string[prefixlen:]:
- node = DMNode(next(c))
- # Create an arc from the previous node to this node
- nodes[-1].arcs[label] = node.n
- nodes.append(node)
- # Mark the last node as an accept state
- nodes[-1].final = True
- last = string
- if len(nodes) > 1:
- add_suffix(dfa, nodes, last, 0, seen)
- return dfa
- def add_suffix(dfa, nodes, last, downto, seen):
- while len(nodes) > downto:
- node = nodes.pop()
- tup = node.tuple()
- # If a node just like this one (final/nonfinal, same arcs to same
- # destinations) is already seen, replace with it
- try:
- this = seen[tup]
- except KeyError:
- this = node.n
- if node.final:
- dfa.add_final_state(this)
- seen[tup] = this
- else:
- # If we replaced the node with an already seen one, fix the parent
- # node's pointer to this
- parent = nodes[-1]
- inlabel = last[len(nodes) - 1]
- parent.arcs[inlabel] = this
- # Add the node's transitions to the DFA
- for label, dest in iteritems(node.arcs):
- dfa.add_transition(this, label, dest)
|