nfa.py 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388
  1. # Copyright 2012 Matt Chaput. All rights reserved.
  2. #
  3. # Redistribution and use in source and binary forms, with or without
  4. # modification, are permitted provided that the following conditions are met:
  5. #
  6. # 1. Redistributions of source code must retain the above copyright notice,
  7. # this list of conditions and the following disclaimer.
  8. #
  9. # 2. Redistributions in binary form must reproduce the above copyright
  10. # notice, this list of conditions and the following disclaimer in the
  11. # documentation and/or other materials provided with the distribution.
  12. #
  13. # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
  14. # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
  15. # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
  16. # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  17. # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  18. # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
  19. # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
  20. # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
  21. # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
  22. # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  23. #
  24. # The views and conclusions contained in the software and documentation are
  25. # those of the authors and should not be interpreted as representing official
  26. # policies, either expressed or implied, of Matt Chaput.
  27. from whoosh.automata.fst import Arc
  28. class Instruction(object):
  29. def __repr__(self):
  30. return "%s()" % (self.__class__.__name__, )
  31. class Char(Instruction):
  32. """
  33. Matches a literal character.
  34. """
  35. def __init__(self, c):
  36. self.c = c
  37. def __repr__(self):
  38. return "Char(%r)" % self.c
  39. class Lit(Instruction):
  40. """
  41. Matches a literal string.
  42. """
  43. def __init__(self, c):
  44. self.c = c
  45. def __repr__(self):
  46. return "Lit(%r)" % self.c
  47. class Any(Instruction):
  48. """
  49. Matches any character.
  50. """
  51. class Match(Instruction):
  52. """
  53. Stop this thread: the string matched.
  54. """
  55. def __repr__(self):
  56. return "Match()"
  57. class Jmp(Instruction):
  58. """
  59. Jump to a specified instruction.
  60. """
  61. def __init__(self, x):
  62. self.x = x
  63. def __repr__(self):
  64. return "Jmp(%s)" % self.x
  65. class Split(Instruction):
  66. """
  67. Split execution: continue at two separate specified instructions.
  68. """
  69. def __init__(self, x, y):
  70. self.x = x
  71. self.y = y
  72. def __repr__(self):
  73. return "Split(%s, %s)" % (self.x, self.y)
  74. class Label(Instruction):
  75. """
  76. Placeholder to act as a target for JMP instructions
  77. """
  78. def __hash__(self):
  79. return id(self)
  80. def __repr__(self):
  81. return "L(%s)" % hex(id(self))
  82. def concat(e1, e2):
  83. return e1 + e2
  84. def alt(e1, e2):
  85. L1, L2, L3 = Label(), Label(), Label()
  86. return [L1] + e1 + [Jmp(L3), L2] + e2 + [L3]
  87. def zero_or_one(e):
  88. L1, L2 = Label(), Label()
  89. return [Split(L1, L2), L1] + e + [L2]
  90. def zero_or_more(e):
  91. L1, L2, L3 = Label(), Label(), Label()
  92. return [L1, Split(L2, L3), L2] + e + [Jmp(L1), L3]
  93. def one_or_more(e):
  94. L1, L2 = Label(), Label()
  95. return [L1] + e + [Split(L1, L2), L2]
  96. def fixup(program):
  97. refs = {}
  98. i = 0
  99. while i < len(program):
  100. op = program[i]
  101. if isinstance(op, Label):
  102. refs[op] = i
  103. program.pop(i)
  104. else:
  105. i += 1
  106. if refs:
  107. for op in program:
  108. if isinstance(op, (Jmp, Split)):
  109. op.x = refs[op.x]
  110. if isinstance(op, Split):
  111. op.y = refs[op.y]
  112. return program + [Match]
  113. class ThreadList(object):
  114. def __init__(self, program, max=1000):
  115. self.program = program
  116. self.max = max
  117. self.threads = []
  118. def __nonzero__(self):
  119. return bool(self.threads)
  120. def current(self):
  121. return self.threads.pop()
  122. def add(self, thread):
  123. op = self.program[thread.pc]
  124. optype = type(op)
  125. if optype is Jmp:
  126. self.add(thread.at(op.x))
  127. elif optype is Split:
  128. self.add(thread.copy_at(op.x))
  129. self.add(thread.at(op.y))
  130. else:
  131. self.threads.append(thread)
  132. class Thread(object):
  133. def __init__(self, pc, address, sofar='', accept=False):
  134. self.pc = pc
  135. self.address = address
  136. self.sofar = sofar
  137. self.accept = accept
  138. def at(self, pc):
  139. self.pc = pc
  140. return self
  141. def copy_at(self, pc):
  142. return Thread(pc, self.address, self.sofar, self.accept)
  143. def __repr__(self):
  144. d = self.__dict__
  145. return "Thread(%s)" % ",".join("%s=%r" % (k, v) for k, v in d.items())
  146. def advance(thread, arc, c):
  147. thread.pc += 1
  148. thread.address = arc.target
  149. thread.sofar += c
  150. thread.accept = arc.accept
  151. def run(graph, program, address):
  152. threads = ThreadList(program)
  153. threads.add(Thread(0, address))
  154. arc = Arc()
  155. while threads:
  156. thread = threads.current()
  157. address = thread.address
  158. op = program[thread.pc]
  159. optype = type(op)
  160. if optype is Char:
  161. if address:
  162. arc = graph.find_arc(address, op.c, arc)
  163. if arc:
  164. advance(thread, arc)
  165. threads.add(thread)
  166. elif optype is Lit:
  167. if address:
  168. c = op.c
  169. arc = graph.find_path(c, arc, address)
  170. if arc:
  171. advance(thread, arc, c)
  172. threads.add(thread)
  173. elif optype is Any:
  174. if address:
  175. sofar = thread.sofar
  176. pc = thread.pc + 1
  177. for arc in graph.iter_arcs(address, arc):
  178. t = Thread(pc, arc.target, sofar + arc.label, arc.accept)
  179. threads.add(t)
  180. elif op is Match:
  181. if thread.accept:
  182. yield thread.sofar
  183. else:
  184. raise Exception("Don't know what to do with %r" % op)
  185. LO = 0
  186. HI = 1
  187. def regex_limit(graph, mode, program, address):
  188. low = mode == LO
  189. output = []
  190. threads = ThreadList(program)
  191. threads.add(Thread(0, address))
  192. arc = Arc()
  193. while threads:
  194. thread = threads.current()
  195. address = thread.address
  196. op = program[thread.pc]
  197. optype = type(op)
  198. if optype is Char:
  199. if address:
  200. arc = graph.find_arc(address, op.c, arc)
  201. if arc:
  202. if low and arc.accept:
  203. return thread.sofar + thread.label
  204. advance(thread, arc)
  205. threads.add(thread)
  206. elif optype is Lit:
  207. if address:
  208. labels = op.c
  209. for label in labels:
  210. arc = graph.find_arc(address, label)
  211. if arc is None:
  212. return thread.sofar
  213. elif thread.accept:
  214. return thread.sofar
  215. elif optype is Any:
  216. if address:
  217. if low:
  218. arc = graph.arc_at(address, arc)
  219. else:
  220. for arc in graph.iter_arcs(address):
  221. pass
  222. advance(thread, arc, arc.label)
  223. threads.add(thread)
  224. elif thread.accept:
  225. return thread.sofar
  226. elif op is Match:
  227. return thread.sofar
  228. else:
  229. raise Exception("Don't know what to do with %r" % op)
  230. # if __name__ == "__main__":
  231. # from whoosh import index, query
  232. # from whoosh.filedb.filestore import RamStorage
  233. # from whoosh.automata import fst
  234. # from whoosh.util.testing import timing
  235. #
  236. # st = RamStorage()
  237. # gw = fst.GraphWriter(st.create_file("test"))
  238. # gw.start_field("test")
  239. # for key in ["aaaa", "aaab", "aabb", "abbb", "babb", "bbab", "bbba"]:
  240. # gw.insert(key)
  241. # gw.close()
  242. # gr = fst.GraphReader(st.open_file("test"))
  243. #
  244. # program = one_or_more([Lit("a")])
  245. # print program
  246. # program = fixup(program)
  247. # print program
  248. # print list(run(gr, program, gr.root("test")))
  249. #
  250. # ix = index.open_dir("e:/dev/src/houdini/help/index")
  251. # r = ix.reader()
  252. # gr = r._get_graph()
  253. #
  254. # # program = fixup([Any(), Any(), Any(), Any(), Any()])
  255. # # program = fixup(concat(zero_or_more([Any()]), [Char("/")]))
  256. # # with timing():
  257. # # x = list(run(gr, program, gr.root("path")))
  258. # # print len(x)
  259. #
  260. # q = query.Regex("path", "^.[abc].*/$")
  261. # with timing():
  262. # y = list(q._btexts(r))
  263. # print len(y)
  264. # print y[0], y[-1]
  265. #
  266. # pr = [Any()] + alt([Lit("c")], alt([Lit("b")], [Lit("a")])) + zero_or_more([Any()]) + [Lit("/")]
  267. # program = fixup(pr)
  268. # # with timing():
  269. # # x = list(run(gr, program, gr.root("path")))
  270. # # print len(x), x
  271. #
  272. # with timing():
  273. # print "lo=", regex_limit(gr, LO, program, gr.root("path"))
  274. # print "hi=", regex_limit(gr, HI, program, gr.root("path"))
  275. #
  276. #
  277. #
  278. # #int
  279. # #backtrackingvm(Inst *prog, char *input)
  280. # #{
  281. # # enum { MAXTHREAD = 1000 };
  282. # # Thread ready[MAXTHREAD];
  283. # # int nready;
  284. # # Inst *pc;
  285. # # char *sp;
  286. # #
  287. # # /* queue initial thread */
  288. # # ready[0] = thread(prog, input);
  289. # # nready = 1;
  290. # #
  291. # # /* run threads in stack order */
  292. # # while(nready > 0){
  293. # # --nready; /* pop state for next thread to run */
  294. # # pc = ready[nready].pc;
  295. # # sp = ready[nready].sp;
  296. # # for(;;){
  297. # # switch(pc->opcode){
  298. # # case Char:
  299. # # if(*sp != pc->c)
  300. # # goto Dead;
  301. # # pc++;
  302. # # sp++;
  303. # # continue;
  304. # # case Match:
  305. # # return 1;
  306. # # case Jmp:
  307. # # pc = pc->x;
  308. # # continue;
  309. # # case Split:
  310. # # if(nready >= MAXTHREAD){
  311. # # fprintf(stderr, "regexp overflow");
  312. # # return -1;
  313. # # }
  314. # # /* queue new thread */
  315. # # ready[nready++] = thread(pc->y, sp);
  316. # # pc = pc->x; /* continue current thread */
  317. # # continue;
  318. # # }
  319. # # }
  320. # # Dead:;
  321. # # }
  322. # # return 0;
  323. # #}
  324. #
  325. #