# scanner.py
  1. # Scanner produces tokens of the following types:
  2. # STREAM-START
  3. # STREAM-END
  4. # DIRECTIVE(name, value)
  5. # DOCUMENT-START
  6. # DOCUMENT-END
  7. # BLOCK-SEQUENCE-START
  8. # BLOCK-MAPPING-START
  9. # BLOCK-END
  10. # FLOW-SEQUENCE-START
  11. # FLOW-MAPPING-START
  12. # FLOW-SEQUENCE-END
  13. # FLOW-MAPPING-END
  14. # BLOCK-ENTRY
  15. # FLOW-ENTRY
  16. # KEY
  17. # VALUE
  18. # ALIAS(value)
  19. # ANCHOR(value)
  20. # TAG(value)
  21. # SCALAR(value, plain, style)
  22. #
  23. # Read comments in the Scanner code for more details.
  24. #
  25. __all__ = ['Scanner', 'ScannerError']
  26. from error import MarkedYAMLError
  27. from tokens import *
  28. class ScannerError(MarkedYAMLError):
  29. pass
  30. class SimpleKey(object):
  31. # See below simple keys treatment.
  32. def __init__(self, token_number, required, index, line, column, mark):
  33. self.token_number = token_number
  34. self.required = required
  35. self.index = index
  36. self.line = line
  37. self.column = column
  38. self.mark = mark
class Scanner(object):

    def __init__(self):
        """Initialize the scanner."""
        # It is assumed that Scanner and Reader will have a common descendant.
        # Reader does the dirty work of checking for BOM and converting the
        # input data to Unicode. It also adds NUL to the end.
        #
        # Reader supports the following methods
        #   self.peek(i=0)     # peek the next i-th character
        #   self.prefix(l=1)   # peek the next l characters
        #   self.forward(l=1)  # read the next l characters and move the pointer

        # Had we reached the end of the stream?
        self.done = False

        # The number of unclosed '{' and '['. `flow_level == 0` means block
        # context.
        self.flow_level = 0

        # List of processed tokens that are not yet emitted.
        self.tokens = []

        # Add the STREAM-START token.
        self.fetch_stream_start()

        # Number of tokens that were emitted through the `get_token` method.
        self.tokens_taken = 0

        # The current indentation level.
        self.indent = -1

        # Past indentation levels.
        self.indents = []

        # Variables related to simple keys treatment.
        # A simple key is a key that is not denoted by the '?' indicator.
        # Example of simple keys:
        #   ---
        #   block simple key: value
        #   ? not a simple key:
        #   : { flow simple key: value }
        # We emit the KEY token before all keys, so when we find a potential
        # simple key, we try to locate the corresponding ':' indicator.
        # Simple keys should be limited to a single line and 1024 characters.

        # Can a simple key start at the current position? A simple key may
        # start:
        # - at the beginning of the line, not counting indentation spaces
        #   (in block context),
        # - after '{', '[', ',' (in the flow context),
        # - after '?', ':', '-' (in the block context).
        # In the block context, this flag also signifies if a block collection
        # may start at the current position.
        self.allow_simple_key = True

        # Keep track of possible simple keys. This is a dictionary. The key
        # is `flow_level`; there can be no more that one possible simple key
        # for each level. The value is a SimpleKey record:
        #   (token_number, required, index, line, column, mark)
        # A simple key may start with ALIAS, ANCHOR, TAG, SCALAR(flow),
        # '[', or '{' tokens.
        self.possible_simple_keys = {}
  91. # Public methods.
  92. def check_token(self, *choices):
  93. # Check if the next token is one of the given types.
  94. while self.need_more_tokens():
  95. self.fetch_more_tokens()
  96. if self.tokens:
  97. if not choices:
  98. return True
  99. for choice in choices:
  100. if isinstance(self.tokens[0], choice):
  101. return True
  102. return False
  103. def peek_token(self):
  104. # Return the next token, but do not delete if from the queue.
  105. while self.need_more_tokens():
  106. self.fetch_more_tokens()
  107. if self.tokens:
  108. return self.tokens[0]
  109. def get_token(self):
  110. # Return the next token.
  111. while self.need_more_tokens():
  112. self.fetch_more_tokens()
  113. if self.tokens:
  114. self.tokens_taken += 1
  115. return self.tokens.pop(0)
  116. # Private methods.
  117. def need_more_tokens(self):
  118. if self.done:
  119. return False
  120. if not self.tokens:
  121. return True
  122. # The current token may be a potential simple key, so we
  123. # need to look further.
  124. self.stale_possible_simple_keys()
  125. if self.next_possible_simple_key() == self.tokens_taken:
  126. return True
    def fetch_more_tokens(self):
        """Fetch the next token(s) from the stream onto `self.tokens`."""

        # Eat whitespaces and comments until we reach the next token.
        self.scan_to_next_token()

        # Remove obsolete possible simple keys.
        self.stale_possible_simple_keys()

        # Compare the current indentation and column. It may add some tokens
        # and decrease the current indentation level.
        self.unwind_indent(self.column)

        # Peek the next character.
        ch = self.peek()

        # Is it the end of stream?
        if ch == u'\0':
            return self.fetch_stream_end()

        # Is it a directive?
        if ch == u'%' and self.check_directive():
            return self.fetch_directive()

        # Is it the document start?
        if ch == u'-' and self.check_document_start():
            return self.fetch_document_start()

        # Is it the document end?
        if ch == u'.' and self.check_document_end():
            return self.fetch_document_end()

        # TODO: support for BOM within a stream.
        #if ch == u'\uFEFF':
        #    return self.fetch_bom()    <-- issue BOMToken

        # Note: the order of the following checks is NOT significant.

        # Is it the flow sequence start indicator?
        if ch == u'[':
            return self.fetch_flow_sequence_start()

        # Is it the flow mapping start indicator?
        if ch == u'{':
            return self.fetch_flow_mapping_start()

        # Is it the flow sequence end indicator?
        if ch == u']':
            return self.fetch_flow_sequence_end()

        # Is it the flow mapping end indicator?
        if ch == u'}':
            return self.fetch_flow_mapping_end()

        # Is it the flow entry indicator?
        if ch == u',':
            return self.fetch_flow_entry()

        # Is it the block entry indicator?
        if ch == u'-' and self.check_block_entry():
            return self.fetch_block_entry()

        # Is it the key indicator?
        if ch == u'?' and self.check_key():
            return self.fetch_key()

        # Is it the value indicator?
        if ch == u':' and self.check_value():
            return self.fetch_value()

        # Is it an alias?
        if ch == u'*':
            return self.fetch_alias()

        # Is it an anchor?
        if ch == u'&':
            return self.fetch_anchor()

        # Is it a tag?
        if ch == u'!':
            return self.fetch_tag()

        # Is it a literal scalar?
        if ch == u'|' and not self.flow_level:
            return self.fetch_literal()

        # Is it a folded scalar?
        if ch == u'>' and not self.flow_level:
            return self.fetch_folded()

        # Is it a single quoted scalar?
        if ch == u'\'':
            return self.fetch_single()

        # Is it a double quoted scalar?
        if ch == u'\"':
            return self.fetch_double()

        # It must be a plain scalar then.
        if self.check_plain():
            return self.fetch_plain()

        # No? It's an error. Let's produce a nice error message.
        raise ScannerError("while scanning for the next token", None,
                "found character %r that cannot start any token"
                % ch.encode('utf-8'), self.get_mark())
  205. # Simple keys treatment.
  206. def next_possible_simple_key(self):
  207. # Return the number of the nearest possible simple key. Actually we
  208. # don't need to loop through the whole dictionary. We may replace it
  209. # with the following code:
  210. # if not self.possible_simple_keys:
  211. # return None
  212. # return self.possible_simple_keys[
  213. # min(self.possible_simple_keys.keys())].token_number
  214. min_token_number = None
  215. for level in self.possible_simple_keys:
  216. key = self.possible_simple_keys[level]
  217. if min_token_number is None or key.token_number < min_token_number:
  218. min_token_number = key.token_number
  219. return min_token_number
  220. def stale_possible_simple_keys(self):
  221. # Remove entries that are no longer possible simple keys. According to
  222. # the YAML specification, simple keys
  223. # - should be limited to a single line,
  224. # - should be no longer than 1024 characters.
  225. # Disabling this procedure will allow simple keys of any length and
  226. # height (may cause problems if indentation is broken though).
  227. for level in self.possible_simple_keys.keys():
  228. key = self.possible_simple_keys[level]
  229. if key.line != self.line \
  230. or self.index-key.index > 1024:
  231. if key.required:
  232. raise ScannerError("while scanning a simple key", key.mark,
  233. "could not find expected ':'", self.get_mark())
  234. del self.possible_simple_keys[level]
    def save_possible_simple_key(self):
        """Record that a simple key may start at the current position.

        Called before fetching tokens that can begin a simple key:
        ALIAS, ANCHOR, TAG, SCALAR(flow), '[', and '{'.
        """
        # Check if a simple key is required at the current position.
        required = not self.flow_level and self.indent == self.column

        # The next token might be a simple key. Let's save its number and
        # position.
        if self.allow_simple_key:
            self.remove_possible_simple_key()
            token_number = self.tokens_taken+len(self.tokens)
            key = SimpleKey(token_number, required,
                    self.index, self.line, self.column, self.get_mark())
            self.possible_simple_keys[self.flow_level] = key
  249. def remove_possible_simple_key(self):
  250. # Remove the saved possible key position at the current flow level.
  251. if self.flow_level in self.possible_simple_keys:
  252. key = self.possible_simple_keys[self.flow_level]
  253. if key.required:
  254. raise ScannerError("while scanning a simple key", key.mark,
  255. "could not find expected ':'", self.get_mark())
  256. del self.possible_simple_keys[self.flow_level]
    # Indentation functions.

    def unwind_indent(self, column):
        """Pop indentation levels deeper than `column`, emitting BLOCK-END.

        In the flow context this is a no-op: indentation is ignored there.
        """
        ## In flow context, tokens should respect indentation.
        ## Actually the condition should be `self.indent >= column` according to
        ## the spec. But this condition will prohibit intuitively correct
        ## constructions such as
        ## key : {
        ## }
        #if self.flow_level and self.indent > column:
        #    raise ScannerError(None, None,
        #            "invalid indentation or unclosed '[' or '{'",
        #            self.get_mark())

        # In the flow context, indentation is ignored. We make the scanner less
        # restrictive than the specification requires.
        if self.flow_level:
            return

        # In block context, we may need to issue the BLOCK-END tokens.
        while self.indent > column:
            mark = self.get_mark()
            self.indent = self.indents.pop()
            self.tokens.append(BlockEndToken(mark, mark))
  278. def add_indent(self, column):
  279. # Check if we need to increase indentation.
  280. if self.indent < column:
  281. self.indents.append(self.indent)
  282. self.indent = column
  283. return True
  284. return False
    # Fetchers.

    def fetch_stream_start(self):
        """Add the STREAM-START token; it is always the very first token."""
        # We always add STREAM-START as the first token and STREAM-END as the
        # last token.
        # Read the token.
        mark = self.get_mark()
        # Add STREAM-START.
        self.tokens.append(StreamStartToken(mark, mark,
            encoding=self.encoding))

    def fetch_stream_end(self):
        """Close all open blocks, add STREAM-END, and mark the scanner done."""
        # Set the current indentation to -1.
        self.unwind_indent(-1)
        # Reset simple keys.
        self.remove_possible_simple_key()
        self.allow_simple_key = False
        self.possible_simple_keys = {}
        # Read the token.
        mark = self.get_mark()
        # Add STREAM-END.
        self.tokens.append(StreamEndToken(mark, mark))
        # The stream is finished.
        self.done = True

    def fetch_directive(self):
        """Scan a '%' directive line and add the DIRECTIVE token."""
        # Set the current indentation to -1.
        self.unwind_indent(-1)
        # Reset simple keys.
        self.remove_possible_simple_key()
        self.allow_simple_key = False
        # Scan and add DIRECTIVE.
        self.tokens.append(self.scan_directive())

    def fetch_document_start(self):
        """Add a DOCUMENT-START token for a '---' marker."""
        self.fetch_document_indicator(DocumentStartToken)

    def fetch_document_end(self):
        """Add a DOCUMENT-END token for a '...' marker."""
        self.fetch_document_indicator(DocumentEndToken)

    def fetch_document_indicator(self, TokenClass):
        """Common code for '---' and '...': unwind indents, consume 3 chars."""
        # Set the current indentation to -1.
        self.unwind_indent(-1)
        # Reset simple keys. Note that there could not be a block collection
        # after '---'.
        self.remove_possible_simple_key()
        self.allow_simple_key = False
        # Add DOCUMENT-START or DOCUMENT-END.
        start_mark = self.get_mark()
        self.forward(3)
        end_mark = self.get_mark()
        self.tokens.append(TokenClass(start_mark, end_mark))
    def fetch_flow_sequence_start(self):
        """Handle '[': start of a flow sequence."""
        self.fetch_flow_collection_start(FlowSequenceStartToken)

    def fetch_flow_mapping_start(self):
        """Handle '{': start of a flow mapping."""
        self.fetch_flow_collection_start(FlowMappingStartToken)

    def fetch_flow_collection_start(self, TokenClass):
        """Common code for '[' and '{'."""
        # '[' and '{' may start a simple key.
        self.save_possible_simple_key()
        # Increase the flow level.
        self.flow_level += 1
        # Simple keys are allowed after '[' and '{'.
        self.allow_simple_key = True
        # Add FLOW-SEQUENCE-START or FLOW-MAPPING-START.
        start_mark = self.get_mark()
        self.forward()
        end_mark = self.get_mark()
        self.tokens.append(TokenClass(start_mark, end_mark))

    def fetch_flow_sequence_end(self):
        """Handle ']': end of a flow sequence."""
        self.fetch_flow_collection_end(FlowSequenceEndToken)

    def fetch_flow_mapping_end(self):
        """Handle '}': end of a flow mapping."""
        self.fetch_flow_collection_end(FlowMappingEndToken)

    def fetch_flow_collection_end(self, TokenClass):
        """Common code for ']' and '}'."""
        # Reset possible simple key on the current level.
        self.remove_possible_simple_key()
        # Decrease the flow level.
        self.flow_level -= 1
        # No simple keys after ']' or '}'.
        self.allow_simple_key = False
        # Add FLOW-SEQUENCE-END or FLOW-MAPPING-END.
        start_mark = self.get_mark()
        self.forward()
        end_mark = self.get_mark()
        self.tokens.append(TokenClass(start_mark, end_mark))

    def fetch_flow_entry(self):
        """Handle ',' inside a flow collection."""
        # Simple keys are allowed after ','.
        self.allow_simple_key = True
        # Reset possible simple key on the current level.
        self.remove_possible_simple_key()
        # Add FLOW-ENTRY.
        start_mark = self.get_mark()
        self.forward()
        end_mark = self.get_mark()
        self.tokens.append(FlowEntryToken(start_mark, end_mark))
    def fetch_block_entry(self):
        """Handle a '-' block sequence entry indicator."""
        # Block context needs additional checks.
        if not self.flow_level:
            # Are we allowed to start a new entry?
            if not self.allow_simple_key:
                raise ScannerError(None, None,
                        "sequence entries are not allowed here",
                        self.get_mark())
            # We may need to add BLOCK-SEQUENCE-START.
            if self.add_indent(self.column):
                mark = self.get_mark()
                self.tokens.append(BlockSequenceStartToken(mark, mark))
        # It's an error for the block entry to occur in the flow context,
        # but we let the parser detect this.
        else:
            pass
        # Simple keys are allowed after '-'.
        self.allow_simple_key = True
        # Reset possible simple key on the current level.
        self.remove_possible_simple_key()
        # Add BLOCK-ENTRY.
        start_mark = self.get_mark()
        self.forward()
        end_mark = self.get_mark()
        self.tokens.append(BlockEntryToken(start_mark, end_mark))
    def fetch_key(self):
        """Handle a '?' complex-key indicator."""
        # Block context needs additional checks.
        if not self.flow_level:
            # Are we allowed to start a key (not necessarily a simple one)?
            if not self.allow_simple_key:
                raise ScannerError(None, None,
                        "mapping keys are not allowed here",
                        self.get_mark())
            # We may need to add BLOCK-MAPPING-START.
            if self.add_indent(self.column):
                mark = self.get_mark()
                self.tokens.append(BlockMappingStartToken(mark, mark))
        # Simple keys are allowed after '?' in the block context.
        self.allow_simple_key = not self.flow_level
        # Reset possible simple key on the current level.
        self.remove_possible_simple_key()
        # Add KEY.
        start_mark = self.get_mark()
        self.forward()
        end_mark = self.get_mark()
        self.tokens.append(KeyToken(start_mark, end_mark))
    def fetch_value(self):
        """Handle a ':' indicator, materializing a pending simple key if any.

        When a simple key was saved for the current flow level, a KEY token
        (and possibly a BLOCK-MAPPING-START) is inserted retroactively at the
        key's recorded position in the token queue.
        """
        # Do we determine a simple key?
        if self.flow_level in self.possible_simple_keys:
            # Add KEY.
            key = self.possible_simple_keys[self.flow_level]
            del self.possible_simple_keys[self.flow_level]
            self.tokens.insert(key.token_number-self.tokens_taken,
                    KeyToken(key.mark, key.mark))
            # If this key starts a new block mapping, we need to add
            # BLOCK-MAPPING-START.
            if not self.flow_level:
                if self.add_indent(key.column):
                    self.tokens.insert(key.token_number-self.tokens_taken,
                            BlockMappingStartToken(key.mark, key.mark))
            # There cannot be two simple keys one after another.
            self.allow_simple_key = False
        # It must be a part of a complex key.
        else:
            # Block context needs additional checks.
            # (Do we really need them? They will be caught by the parser
            # anyway.)
            if not self.flow_level:
                # We are allowed to start a complex value if and only if
                # we can start a simple key.
                if not self.allow_simple_key:
                    raise ScannerError(None, None,
                            "mapping values are not allowed here",
                            self.get_mark())
            # If this value starts a new block mapping, we need to add
            # BLOCK-MAPPING-START. It will be detected as an error later by
            # the parser.
            if not self.flow_level:
                if self.add_indent(self.column):
                    mark = self.get_mark()
                    self.tokens.append(BlockMappingStartToken(mark, mark))
            # Simple keys are allowed after ':' in the block context.
            self.allow_simple_key = not self.flow_level
            # Reset possible simple key on the current level.
            self.remove_possible_simple_key()
        # Add VALUE.
        start_mark = self.get_mark()
        self.forward()
        end_mark = self.get_mark()
        self.tokens.append(ValueToken(start_mark, end_mark))
    def fetch_alias(self):
        """Handle '*': scan and add an ALIAS token."""
        # ALIAS could be a simple key.
        self.save_possible_simple_key()
        # No simple keys after ALIAS.
        self.allow_simple_key = False
        # Scan and add ALIAS.
        self.tokens.append(self.scan_anchor(AliasToken))

    def fetch_anchor(self):
        """Handle '&': scan and add an ANCHOR token."""
        # ANCHOR could start a simple key.
        self.save_possible_simple_key()
        # No simple keys after ANCHOR.
        self.allow_simple_key = False
        # Scan and add ANCHOR.
        self.tokens.append(self.scan_anchor(AnchorToken))

    def fetch_tag(self):
        """Handle '!': scan and add a TAG token."""
        # TAG could start a simple key.
        self.save_possible_simple_key()
        # No simple keys after TAG.
        self.allow_simple_key = False
        # Scan and add TAG.
        self.tokens.append(self.scan_tag())

    def fetch_literal(self):
        """Handle '|': a literal block scalar."""
        self.fetch_block_scalar(style='|')

    def fetch_folded(self):
        """Handle '>': a folded block scalar."""
        self.fetch_block_scalar(style='>')

    def fetch_block_scalar(self, style):
        """Common code for literal and folded block scalars."""
        # A simple key may follow a block scalar.
        self.allow_simple_key = True
        # Reset possible simple key on the current level.
        self.remove_possible_simple_key()
        # Scan and add SCALAR.
        self.tokens.append(self.scan_block_scalar(style))

    def fetch_single(self):
        """Handle a single-quoted flow scalar."""
        self.fetch_flow_scalar(style='\'')

    def fetch_double(self):
        """Handle a double-quoted flow scalar."""
        self.fetch_flow_scalar(style='"')

    def fetch_flow_scalar(self, style):
        """Common code for single- and double-quoted scalars."""
        # A flow scalar could be a simple key.
        self.save_possible_simple_key()
        # No simple keys after flow scalars.
        self.allow_simple_key = False
        # Scan and add SCALAR.
        self.tokens.append(self.scan_flow_scalar(style))

    def fetch_plain(self):
        """Handle a plain (unquoted) scalar."""
        # A plain scalar could be a simple key.
        self.save_possible_simple_key()
        # No simple keys after plain scalars. But note that `scan_plain` will
        # change this flag if the scan is finished at the beginning of the
        # line.
        self.allow_simple_key = False
        # Scan and add SCALAR. May change `allow_simple_key`.
        self.tokens.append(self.scan_plain())
  515. # Checkers.
  516. def check_directive(self):
  517. # DIRECTIVE: ^ '%' ...
  518. # The '%' indicator is already checked.
  519. if self.column == 0:
  520. return True
  521. def check_document_start(self):
  522. # DOCUMENT-START: ^ '---' (' '|'\n')
  523. if self.column == 0:
  524. if self.prefix(3) == u'---' \
  525. and self.peek(3) in u'\0 \t\r\n\x85\u2028\u2029':
  526. return True
  527. def check_document_end(self):
  528. # DOCUMENT-END: ^ '...' (' '|'\n')
  529. if self.column == 0:
  530. if self.prefix(3) == u'...' \
  531. and self.peek(3) in u'\0 \t\r\n\x85\u2028\u2029':
  532. return True
  533. def check_block_entry(self):
  534. # BLOCK-ENTRY: '-' (' '|'\n')
  535. return self.peek(1) in u'\0 \t\r\n\x85\u2028\u2029'
  536. def check_key(self):
  537. # KEY(flow context): '?'
  538. if self.flow_level:
  539. return True
  540. # KEY(block context): '?' (' '|'\n')
  541. else:
  542. return self.peek(1) in u'\0 \t\r\n\x85\u2028\u2029'
  543. def check_value(self):
  544. # VALUE(flow context): ':'
  545. if self.flow_level:
  546. return True
  547. # VALUE(block context): ':' (' '|'\n')
  548. else:
  549. return self.peek(1) in u'\0 \t\r\n\x85\u2028\u2029'
    def check_plain(self):
        """Return True if a plain scalar may start at the current position."""
        # A plain scalar may start with any non-space character except:
        #   '-', '?', ':', ',', '[', ']', '{', '}',
        #   '#', '&', '*', '!', '|', '>', '\'', '\"',
        #   '%', '@', '`'.
        #
        # It may also start with
        #   '-', '?', ':'
        # if it is followed by a non-space character.
        #
        # Note that we limit the last rule to the block context (except the
        # '-' character) because we want the flow context to be space
        # independent.
        ch = self.peek()
        return ch not in u'\0 \t\r\n\x85\u2028\u2029-?:,[]{}#&*!|>\'\"%@`'  \
                or (self.peek(1) not in u'\0 \t\r\n\x85\u2028\u2029'
                        and (ch == u'-' or (not self.flow_level and ch in u'?:')))
    # Scanners.

    def scan_to_next_token(self):
        """Skip spaces, line breaks and comments up to the next token.

        Sets `allow_simple_key` when a line break is seen in block context.
        """
        # We ignore spaces, line breaks and comments.
        # If we find a line break in the block context, we set the flag
        # `allow_simple_key` on.
        # The byte order mark is stripped if it's the first character in the
        # stream. We do not yet support BOM inside the stream as the
        # specification requires. Any such mark will be considered as a part
        # of the document.
        #
        # TODO: We need to make tab handling rules more sane. A good rule is
        #   Tabs cannot precede tokens
        #   BLOCK-SEQUENCE-START, BLOCK-MAPPING-START, BLOCK-END,
        #   KEY(block), VALUE(block), BLOCK-ENTRY
        # So the checking code is
        #   if <TAB>:
        #       self.allow_simple_keys = False
        # We also need to add the check for `allow_simple_keys == True` to
        # `unwind_indent` before issuing BLOCK-END.
        # Scanners for block, flow, and plain scalars need to be modified.
        if self.index == 0 and self.peek() == u'\uFEFF':
            self.forward()
        found = False
        while not found:
            while self.peek() == u' ':
                self.forward()
            if self.peek() == u'#':
                while self.peek() not in u'\0\r\n\x85\u2028\u2029':
                    self.forward()
            if self.scan_line_break():
                if not self.flow_level:
                    self.allow_simple_key = True
            else:
                found = True
    def scan_directive(self):
        """Scan a complete '%' directive line; return a DirectiveToken."""
        # See the specification for details.
        start_mark = self.get_mark()
        self.forward()
        name = self.scan_directive_name(start_mark)
        value = None
        if name == u'YAML':
            value = self.scan_yaml_directive_value(start_mark)
            end_mark = self.get_mark()
        elif name == u'TAG':
            value = self.scan_tag_directive_value(start_mark)
            end_mark = self.get_mark()
        else:
            # Unknown directive: skip the rest of the line.
            end_mark = self.get_mark()
            while self.peek() not in u'\0\r\n\x85\u2028\u2029':
                self.forward()
        self.scan_directive_ignored_line(start_mark)
        return DirectiveToken(name, value, start_mark, end_mark)
    def scan_directive_name(self, start_mark):
        """Scan a directive name: alphanumerics plus '-' and '_'."""
        # See the specification for details.
        length = 0
        ch = self.peek(length)
        while u'0' <= ch <= u'9' or u'A' <= ch <= u'Z' or u'a' <= ch <= u'z'  \
                or ch in u'-_':
            length += 1
            ch = self.peek(length)
        if not length:
            raise ScannerError("while scanning a directive", start_mark,
                    "expected alphabetic or numeric character, but found %r"
                    % ch.encode('utf-8'), self.get_mark())
        value = self.prefix(length)
        self.forward(length)
        ch = self.peek()
        # The name must be terminated by a space or a line break.
        if ch not in u'\0 \r\n\x85\u2028\u2029':
            raise ScannerError("while scanning a directive", start_mark,
                    "expected alphabetic or numeric character, but found %r"
                    % ch.encode('utf-8'), self.get_mark())
        return value
  639. def scan_yaml_directive_value(self, start_mark):
  640. # See the specification for details.
  641. while self.peek() == u' ':
  642. self.forward()
  643. major = self.scan_yaml_directive_number(start_mark)
  644. if self.peek() != '.':
  645. raise ScannerError("while scanning a directive", start_mark,
  646. "expected a digit or '.', but found %r"
  647. % self.peek().encode('utf-8'),
  648. self.get_mark())
  649. self.forward()
  650. minor = self.scan_yaml_directive_number(start_mark)
  651. if self.peek() not in u'\0 \r\n\x85\u2028\u2029':
  652. raise ScannerError("while scanning a directive", start_mark,
  653. "expected a digit or ' ', but found %r"
  654. % self.peek().encode('utf-8'),
  655. self.get_mark())
  656. return (major, minor)
    def scan_yaml_directive_number(self, start_mark):
        """Scan a run of decimal digits and return it as an int."""
        # See the specification for details.
        ch = self.peek()
        if not (u'0' <= ch <= u'9'):
            raise ScannerError("while scanning a directive", start_mark,
                    "expected a digit, but found %r" % ch.encode('utf-8'),
                    self.get_mark())
        length = 0
        while u'0' <= self.peek(length) <= u'9':
            length += 1
        value = int(self.prefix(length))
        self.forward(length)
        return value
    def scan_tag_directive_value(self, start_mark):
        """Scan the '%TAG <handle> <prefix>' pair."""
        # See the specification for details.
        while self.peek() == u' ':
            self.forward()
        handle = self.scan_tag_directive_handle(start_mark)
        while self.peek() == u' ':
            self.forward()
        prefix = self.scan_tag_directive_prefix(start_mark)
        return (handle, prefix)

    def scan_tag_directive_handle(self, start_mark):
        """Scan the tag handle part of a %TAG directive."""
        # See the specification for details.
        value = self.scan_tag_handle('directive', start_mark)
        ch = self.peek()
        # The handle must be followed by a space separating it from the prefix.
        if ch != u' ':
            raise ScannerError("while scanning a directive", start_mark,
                    "expected ' ', but found %r" % ch.encode('utf-8'),
                    self.get_mark())
        return value

    def scan_tag_directive_prefix(self, start_mark):
        """Scan the tag prefix part of a %TAG directive."""
        # See the specification for details.
        value = self.scan_tag_uri('directive', start_mark)
        ch = self.peek()
        # The prefix must be terminated by a space or a line break.
        if ch not in u'\0 \r\n\x85\u2028\u2029':
            raise ScannerError("while scanning a directive", start_mark,
                    "expected ' ', but found %r" % ch.encode('utf-8'),
                    self.get_mark())
        return value
  697. def scan_directive_ignored_line(self, start_mark):
  698. # See the specification for details.
  699. while self.peek() == u' ':
  700. self.forward()
  701. if self.peek() == u'#':
  702. while self.peek() not in u'\0\r\n\x85\u2028\u2029':
  703. self.forward()
  704. ch = self.peek()
  705. if ch not in u'\0\r\n\x85\u2028\u2029':
  706. raise ScannerError("while scanning a directive", start_mark,
  707. "expected a comment or a line break, but found %r"
  708. % ch.encode('utf-8'), self.get_mark())
  709. self.scan_line_break()
def scan_anchor(self, TokenClass):
    """Scan an anchor ('&name') or alias ('*name') and return an
    instance of TokenClass built from the scanned name.
    """
    # The specification does not restrict characters for anchors and
    # aliases.  This may lead to problems; for instance, the document:
    #   [ *alias, value ]
    # can be interpreted in two ways, as
    #   [ "value" ]
    # and
    #   [ *alias , "value" ]
    # Therefore we restrict aliases to numbers and ASCII letters.
    start_mark = self.get_mark()
    indicator = self.peek()
    # Pick the error-message wording from the indicator character.
    if indicator == u'*':
        name = 'alias'
    else:
        name = 'anchor'
    self.forward()
    length = 0
    ch = self.peek(length)
    # Accept only [0-9A-Za-z_-] in the anchor/alias name.
    while u'0' <= ch <= u'9' or u'A' <= ch <= u'Z' or u'a' <= ch <= u'z' \
            or ch in u'-_':
        length += 1
        ch = self.peek(length)
    if not length:
        raise ScannerError("while scanning an %s" % name, start_mark,
                "expected alphabetic or numeric character, but found %r"
                % ch.encode('utf-8'), self.get_mark())
    value = self.prefix(length)
    self.forward(length)
    ch = self.peek()
    # The name must be followed by whitespace, a break, or one of the
    # indicator characters below.
    if ch not in u'\0 \t\r\n\x85\u2028\u2029?:,]}%@`':
        raise ScannerError("while scanning an %s" % name, start_mark,
                "expected alphabetic or numeric character, but found %r"
                % ch.encode('utf-8'), self.get_mark())
    end_mark = self.get_mark()
    return TokenClass(value, start_mark, end_mark)
  745. def scan_tag(self):
  746. # See the specification for details.
  747. start_mark = self.get_mark()
  748. ch = self.peek(1)
  749. if ch == u'<':
  750. handle = None
  751. self.forward(2)
  752. suffix = self.scan_tag_uri('tag', start_mark)
  753. if self.peek() != u'>':
  754. raise ScannerError("while parsing a tag", start_mark,
  755. "expected '>', but found %r" % self.peek().encode('utf-8'),
  756. self.get_mark())
  757. self.forward()
  758. elif ch in u'\0 \t\r\n\x85\u2028\u2029':
  759. handle = None
  760. suffix = u'!'
  761. self.forward()
  762. else:
  763. length = 1
  764. use_handle = False
  765. while ch not in u'\0 \r\n\x85\u2028\u2029':
  766. if ch == u'!':
  767. use_handle = True
  768. break
  769. length += 1
  770. ch = self.peek(length)
  771. handle = u'!'
  772. if use_handle:
  773. handle = self.scan_tag_handle('tag', start_mark)
  774. else:
  775. handle = u'!'
  776. self.forward()
  777. suffix = self.scan_tag_uri('tag', start_mark)
  778. ch = self.peek()
  779. if ch not in u'\0 \r\n\x85\u2028\u2029':
  780. raise ScannerError("while scanning a tag", start_mark,
  781. "expected ' ', but found %r" % ch.encode('utf-8'),
  782. self.get_mark())
  783. value = (handle, suffix)
  784. end_mark = self.get_mark()
  785. return TagToken(value, start_mark, end_mark)
def scan_block_scalar(self, style):
    """Scan a literal ('|') or folded ('>') block scalar and return a
    ScalarToken; `style` is the indicator character.
    """
    # See the specification for details.
    if style == '>':
        folded = True
    else:
        folded = False
    chunks = []
    start_mark = self.get_mark()
    # Scan the header.
    self.forward()
    chomping, increment = self.scan_block_scalar_indicators(start_mark)
    self.scan_block_scalar_ignored_line(start_mark)
    # Determine the indentation level and go to the first non-empty line.
    min_indent = self.indent+1
    if min_indent < 1:
        min_indent = 1
    if increment is None:
        # No explicit indentation indicator: auto-detect the indent from
        # the first non-empty line.
        breaks, max_indent, end_mark = self.scan_block_scalar_indentation()
        indent = max(min_indent, max_indent)
    else:
        # Explicit indicator: indent is relative to the parent level.
        indent = min_indent+increment-1
        breaks, end_mark = self.scan_block_scalar_breaks(indent)
    line_break = u''
    # Scan the inner part of the block scalar.
    while self.column == indent and self.peek() != u'\0':
        chunks.extend(breaks)
        leading_non_space = self.peek() not in u' \t'
        length = 0
        while self.peek(length) not in u'\0\r\n\x85\u2028\u2029':
            length += 1
        chunks.append(self.prefix(length))
        self.forward(length)
        line_break = self.scan_line_break()
        breaks, end_mark = self.scan_block_scalar_breaks(indent)
        if self.column == indent and self.peek() != u'\0':
            # Unfortunately, folding rules are ambiguous.
            #
            # This is the folding according to the specification:
            if folded and line_break == u'\n' \
                    and leading_non_space and self.peek() not in u' \t':
                # Fold a single '\n' between two non-blank lines into
                # a space; drop it entirely if empty lines follow.
                if not breaks:
                    chunks.append(u' ')
            else:
                chunks.append(line_break)
            # This is Clark Evans's interpretation (also in the spec
            # examples):
            #
            #if folded and line_break == u'\n':
            #    if not breaks:
            #        if self.peek() not in ' \t':
            #            chunks.append(u' ')
            #    else:
            #        chunks.append(line_break)
            #else:
            #    chunks.append(line_break)
        else:
            break
    # Chomp the tail: keep the final break unless chomping is '-' (strip);
    # keep the trailing empty lines only if chomping is '+' (keep).
    if chomping is not False:
        chunks.append(line_break)
    if chomping is True:
        chunks.extend(breaks)
    # We are done.
    return ScalarToken(u''.join(chunks), False, start_mark, end_mark,
            style)
def scan_block_scalar_indicators(self, start_mark):
    """Scan the optional chomping ('+' keep / '-' strip) and indentation
    (1-9) indicators of a block scalar header, in either order.

    Returns (chomping, increment) where chomping is True ('+'),
    False ('-'), or None (default clipping), and increment is an int
    in 1-9 or None when absent.
    """
    # See the specification for details.
    chomping = None
    increment = None
    ch = self.peek()
    if ch in u'+-':
        # Chomping indicator first, optionally followed by indentation.
        if ch == '+':
            chomping = True
        else:
            chomping = False
        self.forward()
        ch = self.peek()
        if ch in u'0123456789':
            increment = int(ch)
            if increment == 0:
                raise ScannerError("while scanning a block scalar", start_mark,
                        "expected indentation indicator in the range 1-9, but found 0",
                        self.get_mark())
            self.forward()
    elif ch in u'0123456789':
        # Indentation indicator first, optionally followed by chomping.
        increment = int(ch)
        if increment == 0:
            raise ScannerError("while scanning a block scalar", start_mark,
                    "expected indentation indicator in the range 1-9, but found 0",
                    self.get_mark())
        self.forward()
        ch = self.peek()
        if ch in u'+-':
            if ch == '+':
                chomping = True
            else:
                chomping = False
            self.forward()
    ch = self.peek()
    # Nothing but whitespace, a break, or end of stream may follow.
    if ch not in u'\0 \r\n\x85\u2028\u2029':
        raise ScannerError("while scanning a block scalar", start_mark,
                "expected chomping or indentation indicators, but found %r"
                % ch.encode('utf-8'), self.get_mark())
    return chomping, increment
  890. def scan_block_scalar_ignored_line(self, start_mark):
  891. # See the specification for details.
  892. while self.peek() == u' ':
  893. self.forward()
  894. if self.peek() == u'#':
  895. while self.peek() not in u'\0\r\n\x85\u2028\u2029':
  896. self.forward()
  897. ch = self.peek()
  898. if ch not in u'\0\r\n\x85\u2028\u2029':
  899. raise ScannerError("while scanning a block scalar", start_mark,
  900. "expected a comment or a line break, but found %r"
  901. % ch.encode('utf-8'), self.get_mark())
  902. self.scan_line_break()
  903. def scan_block_scalar_indentation(self):
  904. # See the specification for details.
  905. chunks = []
  906. max_indent = 0
  907. end_mark = self.get_mark()
  908. while self.peek() in u' \r\n\x85\u2028\u2029':
  909. if self.peek() != u' ':
  910. chunks.append(self.scan_line_break())
  911. end_mark = self.get_mark()
  912. else:
  913. self.forward()
  914. if self.column > max_indent:
  915. max_indent = self.column
  916. return chunks, max_indent, end_mark
  917. def scan_block_scalar_breaks(self, indent):
  918. # See the specification for details.
  919. chunks = []
  920. end_mark = self.get_mark()
  921. while self.column < indent and self.peek() == u' ':
  922. self.forward()
  923. while self.peek() in u'\r\n\x85\u2028\u2029':
  924. chunks.append(self.scan_line_break())
  925. end_mark = self.get_mark()
  926. while self.column < indent and self.peek() == u' ':
  927. self.forward()
  928. return chunks, end_mark
  929. def scan_flow_scalar(self, style):
  930. # See the specification for details.
  931. # Note that we loose indentation rules for quoted scalars. Quoted
  932. # scalars don't need to adhere indentation because " and ' clearly
  933. # mark the beginning and the end of them. Therefore we are less
  934. # restrictive then the specification requires. We only need to check
  935. # that document separators are not included in scalars.
  936. if style == '"':
  937. double = True
  938. else:
  939. double = False
  940. chunks = []
  941. start_mark = self.get_mark()
  942. quote = self.peek()
  943. self.forward()
  944. chunks.extend(self.scan_flow_scalar_non_spaces(double, start_mark))
  945. while self.peek() != quote:
  946. chunks.extend(self.scan_flow_scalar_spaces(double, start_mark))
  947. chunks.extend(self.scan_flow_scalar_non_spaces(double, start_mark))
  948. self.forward()
  949. end_mark = self.get_mark()
  950. return ScalarToken(u''.join(chunks), False, start_mark, end_mark,
  951. style)
# Map of single-character escapes (the character after '\' in a
# double-quoted scalar) to the character each one denotes.
ESCAPE_REPLACEMENTS = {
    u'0': u'\0',
    u'a': u'\x07',
    u'b': u'\x08',
    u't': u'\x09',
    u'\t': u'\x09',
    u'n': u'\x0A',
    u'v': u'\x0B',
    u'f': u'\x0C',
    u'r': u'\x0D',
    u'e': u'\x1B',
    u' ': u'\x20',
    u'\"': u'\"',
    u'\\': u'\\',
    u'N': u'\x85',
    u'_': u'\xA0',
    u'L': u'\u2028',
    u'P': u'\u2029',
    }
# Map of numeric-escape introducers to the number of hexadecimal digits
# that must follow: \xXX, \uXXXX, \UXXXXXXXX.
ESCAPE_CODES = {
    u'x': 2,
    u'u': 4,
    u'U': 8,
    }
def scan_flow_scalar_non_spaces(self, double, start_mark):
    """Scan the non-blank parts of a quoted scalar, handling the quote
    escapes ('' in single-quoted, backslash escapes in double-quoted)
    and escaped line breaks; return the collected text chunks.
    """
    # See the specification for details.
    chunks = []
    while True:
        # Copy plain characters in bulk up to the next special one.
        length = 0
        while self.peek(length) not in u'\'\"\\\0 \t\r\n\x85\u2028\u2029':
            length += 1
        if length:
            chunks.append(self.prefix(length))
            self.forward(length)
        ch = self.peek()
        if not double and ch == u'\'' and self.peek(1) == u'\'':
            # '' inside a single-quoted scalar is an escaped quote.
            chunks.append(u'\'')
            self.forward(2)
        elif (double and ch == u'\'') or (not double and ch in u'\"\\'):
            # These characters are literal in the current quoting style.
            chunks.append(ch)
            self.forward()
        elif double and ch == u'\\':
            self.forward()
            ch = self.peek()
            if ch in self.ESCAPE_REPLACEMENTS:
                # Single-character escape, e.g. \n, \t.
                chunks.append(self.ESCAPE_REPLACEMENTS[ch])
                self.forward()
            elif ch in self.ESCAPE_CODES:
                # Numeric escape: \xXX, \uXXXX, or \UXXXXXXXX.
                length = self.ESCAPE_CODES[ch]
                self.forward()
                for k in range(length):
                    if self.peek(k) not in u'0123456789ABCDEFabcdef':
                        raise ScannerError("while scanning a double-quoted scalar", start_mark,
                                "expected escape sequence of %d hexdecimal numbers, but found %r" %
                                (length, self.peek(k).encode('utf-8')), self.get_mark())
                code = int(self.prefix(length), 16)
                chunks.append(unichr(code))
                self.forward(length)
            elif ch in u'\r\n\x85\u2028\u2029':
                # An escaped line break is removed, together with any
                # following indentation and empty lines.
                self.scan_line_break()
                chunks.extend(self.scan_flow_scalar_breaks(double, start_mark))
            else:
                raise ScannerError("while scanning a double-quoted scalar", start_mark,
                        "found unknown escape character %r" % ch.encode('utf-8'), self.get_mark())
        else:
            # Reached a blank, a break, the closing quote, or the end.
            return chunks
  1018. def scan_flow_scalar_spaces(self, double, start_mark):
  1019. # See the specification for details.
  1020. chunks = []
  1021. length = 0
  1022. while self.peek(length) in u' \t':
  1023. length += 1
  1024. whitespaces = self.prefix(length)
  1025. self.forward(length)
  1026. ch = self.peek()
  1027. if ch == u'\0':
  1028. raise ScannerError("while scanning a quoted scalar", start_mark,
  1029. "found unexpected end of stream", self.get_mark())
  1030. elif ch in u'\r\n\x85\u2028\u2029':
  1031. line_break = self.scan_line_break()
  1032. breaks = self.scan_flow_scalar_breaks(double, start_mark)
  1033. if line_break != u'\n':
  1034. chunks.append(line_break)
  1035. elif not breaks:
  1036. chunks.append(u' ')
  1037. chunks.extend(breaks)
  1038. else:
  1039. chunks.append(whitespaces)
  1040. return chunks
  1041. def scan_flow_scalar_breaks(self, double, start_mark):
  1042. # See the specification for details.
  1043. chunks = []
  1044. while True:
  1045. # Instead of checking indentation, we check for document
  1046. # separators.
  1047. prefix = self.prefix(3)
  1048. if (prefix == u'---' or prefix == u'...') \
  1049. and self.peek(3) in u'\0 \t\r\n\x85\u2028\u2029':
  1050. raise ScannerError("while scanning a quoted scalar", start_mark,
  1051. "found unexpected document separator", self.get_mark())
  1052. while self.peek() in u' \t':
  1053. self.forward()
  1054. if self.peek() in u'\r\n\x85\u2028\u2029':
  1055. chunks.append(self.scan_line_break())
  1056. else:
  1057. return chunks
def scan_plain(self):
    """Scan a plain (unquoted) scalar and return a ScalarToken."""
    # See the specification for details.
    # We add an additional restriction for the flow context: plain
    # scalars in the flow context cannot contain ',', ':' and '?'.
    # We also keep track of the `allow_simple_key` flag here.
    # Indentation rules are loosed for the flow context.
    chunks = []
    start_mark = self.get_mark()
    end_mark = start_mark
    indent = self.indent+1
    # We allow zero indentation for scalars, but then we need to check for
    # document separators at the beginning of the line.
    #if indent == 0:
    #    indent = 1
    spaces = []
    while True:
        length = 0
        if self.peek() == u'#':
            # A comment ends the scalar.
            break
        # Measure the next run of scalar characters; stop at whitespace,
        # at ': ' outside flow context, or at flow indicators inside it.
        while True:
            ch = self.peek(length)
            if ch in u'\0 \t\r\n\x85\u2028\u2029' \
                    or (not self.flow_level and ch == u':' and
                        self.peek(length+1) in u'\0 \t\r\n\x85\u2028\u2029') \
                    or (self.flow_level and ch in u',:?[]{}'):
                break
            length += 1
        # It's not clear what we should do with ':' in the flow context.
        if (self.flow_level and ch == u':'
                and self.peek(length+1) not in u'\0 \t\r\n\x85\u2028\u2029,[]{}'):
            self.forward(length)
            raise ScannerError("while scanning a plain scalar", start_mark,
                    "found unexpected ':'", self.get_mark(),
                    "Please check http://pyyaml.org/wiki/YAMLColonInFlowContext for details.")
        if length == 0:
            break
        # Once the scalar has content, it can no longer start a simple key.
        self.allow_simple_key = False
        chunks.extend(spaces)
        chunks.append(self.prefix(length))
        self.forward(length)
        end_mark = self.get_mark()
        spaces = self.scan_plain_spaces(indent, start_mark)
        # scan_plain_spaces returns None (falsy) on a document separator.
        if not spaces or self.peek() == u'#' \
                or (not self.flow_level and self.column < indent):
            break
    return ScalarToken(u''.join(chunks), True, start_mark, end_mark)
  1104. def scan_plain_spaces(self, indent, start_mark):
  1105. # See the specification for details.
  1106. # The specification is really confusing about tabs in plain scalars.
  1107. # We just forbid them completely. Do not use tabs in YAML!
  1108. chunks = []
  1109. length = 0
  1110. while self.peek(length) in u' ':
  1111. length += 1
  1112. whitespaces = self.prefix(length)
  1113. self.forward(length)
  1114. ch = self.peek()
  1115. if ch in u'\r\n\x85\u2028\u2029':
  1116. line_break = self.scan_line_break()
  1117. self.allow_simple_key = True
  1118. prefix = self.prefix(3)
  1119. if (prefix == u'---' or prefix == u'...') \
  1120. and self.peek(3) in u'\0 \t\r\n\x85\u2028\u2029':
  1121. return
  1122. breaks = []
  1123. while self.peek() in u' \r\n\x85\u2028\u2029':
  1124. if self.peek() == ' ':
  1125. self.forward()
  1126. else:
  1127. breaks.append(self.scan_line_break())
  1128. prefix = self.prefix(3)
  1129. if (prefix == u'---' or prefix == u'...') \
  1130. and self.peek(3) in u'\0 \t\r\n\x85\u2028\u2029':
  1131. return
  1132. if line_break != u'\n':
  1133. chunks.append(line_break)
  1134. elif not breaks:
  1135. chunks.append(u' ')
  1136. chunks.extend(breaks)
  1137. elif whitespaces:
  1138. chunks.append(whitespaces)
  1139. return chunks
  1140. def scan_tag_handle(self, name, start_mark):
  1141. # See the specification for details.
  1142. # For some strange reasons, the specification does not allow '_' in
  1143. # tag handles. I have allowed it anyway.
  1144. ch = self.peek()
  1145. if ch != u'!':
  1146. raise ScannerError("while scanning a %s" % name, start_mark,
  1147. "expected '!', but found %r" % ch.encode('utf-8'),
  1148. self.get_mark())
  1149. length = 1
  1150. ch = self.peek(length)
  1151. if ch != u' ':
  1152. while u'0' <= ch <= u'9' or u'A' <= ch <= u'Z' or u'a' <= ch <= u'z' \
  1153. or ch in u'-_':
  1154. length += 1
  1155. ch = self.peek(length)
  1156. if ch != u'!':
  1157. self.forward(length)
  1158. raise ScannerError("while scanning a %s" % name, start_mark,
  1159. "expected '!', but found %r" % ch.encode('utf-8'),
  1160. self.get_mark())
  1161. length += 1
  1162. value = self.prefix(length)
  1163. self.forward(length)
  1164. return value
  1165. def scan_tag_uri(self, name, start_mark):
  1166. # See the specification for details.
  1167. # Note: we do not check if URI is well-formed.
  1168. chunks = []
  1169. length = 0
  1170. ch = self.peek(length)
  1171. while u'0' <= ch <= u'9' or u'A' <= ch <= u'Z' or u'a' <= ch <= u'z' \
  1172. or ch in u'-;/?:@&=+$,_.!~*\'()[]%':
  1173. if ch == u'%':
  1174. chunks.append(self.prefix(length))
  1175. self.forward(length)
  1176. length = 0
  1177. chunks.append(self.scan_uri_escapes(name, start_mark))
  1178. else:
  1179. length += 1
  1180. ch = self.peek(length)
  1181. if length:
  1182. chunks.append(self.prefix(length))
  1183. self.forward(length)
  1184. length = 0
  1185. if not chunks:
  1186. raise ScannerError("while parsing a %s" % name, start_mark,
  1187. "expected URI, but found %r" % ch.encode('utf-8'),
  1188. self.get_mark())
  1189. return u''.join(chunks)
def scan_uri_escapes(self, name, start_mark):
    """Scan a run of '%XX' escapes and return the UTF-8 decoded text.

    The raw bytes are collected first and decoded in one pass, so a
    multi-byte character split across several escapes decodes correctly.
    """
    # See the specification for details.
    bytes = []
    mark = self.get_mark()
    while self.peek() == u'%':
        self.forward()
        # Each escape must be exactly two hexadecimal digits.
        for k in range(2):
            if self.peek(k) not in u'0123456789ABCDEFabcdef':
                raise ScannerError("while scanning a %s" % name, start_mark,
                        "expected URI escape sequence of 2 hexdecimal numbers, but found %r" %
                        (self.peek(k).encode('utf-8')), self.get_mark())
        bytes.append(chr(int(self.prefix(2), 16)))
        self.forward(2)
    try:
        value = unicode(''.join(bytes), 'utf-8')
    except UnicodeDecodeError, exc:
        # Report the decode failure at the start of the escape run.
        raise ScannerError("while scanning a %s" % name, start_mark, str(exc), mark)
    return value
  1208. def scan_line_break(self):
  1209. # Transforms:
  1210. # '\r\n' : '\n'
  1211. # '\r' : '\n'
  1212. # '\n' : '\n'
  1213. # '\x85' : '\n'
  1214. # '\u2028' : '\u2028'
  1215. # '\u2029 : '\u2029'
  1216. # default : ''
  1217. ch = self.peek()
  1218. if ch in u'\r\n\x85':
  1219. if self.prefix(2) == u'\r\n':
  1220. self.forward(2)
  1221. else:
  1222. self.forward()
  1223. return u'\n'
  1224. elif ch in u'\u2028\u2029':
  1225. self.forward()
  1226. return ch
  1227. return u''
  1228. #try:
  1229. # import psyco
  1230. # psyco.bind(Scanner)
  1231. #except ImportError:
  1232. # pass