Tokenizer.d.ts 5.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173
  1. /** All the states the tokenizer can be in. Ambient `const enum` with explicit, consecutive values 1 through 29; declared without `export`, yet referenced by the optional `state` parameter of `Callbacks.onerror`. */
  2. declare const enum State {
  3. Text = 1,
  4. BeforeTagName = 2,
  5. InTagName = 3,
  6. InSelfClosingTag = 4,
  7. BeforeClosingTagName = 5,
  8. InClosingTagName = 6,
  9. AfterClosingTagName = 7,
  10. BeforeAttributeName = 8,
  11. InAttributeName = 9,
  12. AfterAttributeName = 10,
  13. BeforeAttributeValue = 11,
  14. InAttributeValueDq = 12, // presumably "double quotes" (cf. `stateInAttributeValueDoubleQuotes` on `Tokenizer`) — confirm
  15. InAttributeValueSq = 13, // presumably "single quotes" (cf. `stateInAttributeValueSingleQuotes`) — confirm
  16. InAttributeValueNq = 14, // presumably "no quotes" (cf. `stateInAttributeValueNoQuotes`) — confirm
  17. BeforeDeclaration = 15,
  18. InDeclaration = 16,
  19. InProcessingInstruction = 17,
  20. BeforeComment = 18,
  21. CDATASequence = 19,
  22. InSpecialComment = 20,
  23. InCommentLike = 21, // presumably shared by comments and CDATA (cf. the doc on `stateInCommentLike`) — confirm
  24. BeforeSpecialS = 22,
  25. SpecialStartSequence = 23, // presumably part of the script/style handling flagged by `isSpecial` — confirm
  26. InSpecialTag = 24,
  27. BeforeEntity = 25, // entity states (25-29): the `baseState` doc on `Tokenizer` says entity decoding is done while in another state
  28. BeforeNumericEntity = 26,
  29. InNamedEntity = 27,
  30. InNumericEntity = 28,
  31. InHexEntity = 29
  32. }
  33. /** Callback set handed to the `Tokenizer` constructor as its `cbs` argument. Every member returns `void`; when each one fires is decided by the implementation, which this declaration file does not show. */ export interface Callbacks {
  34. onattribdata(value: string): void;
  35. onattribend(quote: string | undefined | null): void; // NOTE(review): `quote` is presumably the delimiter of the attribute value; what `undefined` vs `null` signal is not visible here — confirm
  36. onattribname(name: string): void;
  37. oncdata(data: string): void;
  38. onclosetag(name: string): void;
  39. oncomment(data: string): void;
  40. ondeclaration(content: string): void;
  41. onend(): void;
  42. onerror(error: Error, state?: State): void; // `state` is optional; presumably the tokenizer state when the error occurred — confirm. `State` itself is not exported from this file.
  43. onopentagend(): void;
  44. onopentagname(name: string): void;
  45. onprocessinginstruction(instruction: string): void;
  46. onselfclosingtag(): void;
  47. ontext(value: string): void;
  48. }
  49. /** Type declaration of the tokenizer class (default export). The public surface is `sectionStart`, the constructor, and `reset`, `write`, `end`, `pause`, `resume`, `getAbsoluteSectionStart`, `getAbsoluteIndex`; private members are listed by name only, without types. */ export default class Tokenizer {
  50. private readonly cbs; // presumably the `Callbacks` object passed to the constructor — confirm
  51. /** The current state the tokenizer is in (presumably a `State` value; the type is not shown here). */
  52. private _state;
  53. /** The read buffer. */
  54. private buffer;
  55. /** The beginning of the section that is currently being read. This is the only public field of the class. */
  56. sectionStart: number;
  57. /** The index within the buffer that we are currently looking at. */
  58. private _index;
  59. /**
  60. * Data that has already been processed will be removed from the buffer occasionally.
  61. * `bufferOffset` keeps track of how many characters have been removed, to make sure position information is accurate.
  62. */
  63. private bufferOffset;
  64. /** Some behavior, eg. when decoding entities, is done while we are in another state. This keeps track of the other state type. */
  65. private baseState;
  66. /** For special parsing behavior inside of script and style tags. */
  67. private isSpecial;
  68. /** Indicates whether the tokenizer has been paused. NOTE(review): the name suggests `true` means "not paused" — confirm the polarity against the implementation. */
  69. private running;
  70. /** Indicates whether the tokenizer has finished running / `.end` has been called. */
  71. private ended;
  72. private readonly xmlMode; // presumably set from the constructor option of the same name — confirm
  73. private readonly decodeEntities; // presumably set from the constructor option of the same name — confirm
  74. private readonly entityTrie;
  75. /** Takes an options object (`xmlMode` and `decodeEntities`, both optional booleans whose defaults are not visible here) and the callback set. */ constructor({ xmlMode, decodeEntities, }: {
  76. xmlMode?: boolean;
  77. decodeEntities?: boolean;
  78. }, cbs: Callbacks);
  79. /** Presumably returns the tokenizer to its initial state — confirm against the implementation. */ reset(): void;
  80. /** Accepts a string chunk of input; presumably adds it to the read buffer and continues tokenizing — confirm. */ write(chunk: string): void;
  81. /** Signals the end of input, with an optional final chunk. The `ended` field doc ties that flag to `.end` having been called. */ end(chunk?: string): void;
  82. /** Presumably suspends tokenizing (see the `running` field) — confirm. */ pause(): void;
  83. /** Presumably continues tokenizing after `pause()` — confirm. */ resume(): void;
  84. /**
  85. * The start of the current section (presumably `sectionStart` expressed as a position within all of the written data — confirm).
  86. */
  87. getAbsoluteSectionStart(): number;
  88. /**
  89. * The current index within all of the written data.
  90. */
  91. getAbsoluteIndex(): number;
  92. private stateText; // the `state*` members below look like one handler per `State` value; the `parse` doc says it calls the function corresponding to the current state
  93. private currentSequence;
  94. private sequenceIndex;
  95. private stateSpecialStartSequence;
  96. /** Look for an end tag. For <title> tags, also decode entities. */
  97. private stateInSpecialTag;
  98. private stateCDATASequence;
  99. /**
  100. * When we wait for one specific character, we can speed things up
  101. * by skipping through the buffer until we find it.
  102. *
  103. * @returns Whether the character was found.
  104. */
  105. private fastForwardTo;
  106. /**
  107. * Comments and CDATA end with `-->` and `]]>`.
  108. *
  109. * Their common qualities are:
  110. * - Their end sequences have a distinct character they start with.
  111. * - That character is then repeated, so we have to check multiple repeats.
  112. * - All characters but the start character of the sequence can be skipped.
  113. */
  114. private stateInCommentLike;
  115. /**
  116. * HTML only allows ASCII alpha characters (a-z and A-Z) at the beginning of a tag name.
  117. *
  118. * XML allows a lot more characters here (@see https://www.w3.org/TR/REC-xml/#NT-NameStartChar).
  119. * We allow anything that wouldn't end the tag.
  120. */
  121. private isTagStartChar;
  122. private startSpecial;
  123. private stateBeforeTagName;
  124. private stateInTagName;
  125. private stateBeforeClosingTagName;
  126. private stateInClosingTagName;
  127. private stateAfterClosingTagName;
  128. private stateBeforeAttributeName;
  129. private stateInSelfClosingTag;
  130. private stateInAttributeName;
  131. private stateAfterAttributeName;
  132. private stateBeforeAttributeValue;
  133. private handleInAttributeValue; // presumably shared by the three quote-specific handlers below — confirm
  134. private stateInAttributeValueDoubleQuotes;
  135. private stateInAttributeValueSingleQuotes;
  136. private stateInAttributeValueNoQuotes;
  137. private stateBeforeDeclaration;
  138. private stateInDeclaration;
  139. private stateInProcessingInstruction;
  140. private stateBeforeComment;
  141. private stateInSpecialComment;
  142. private stateBeforeSpecialS;
  143. private trieIndex; // the `trie*` members and `entityExcess` are presumably bookkeeping for matching named entities against `entityTrie` — confirm
  144. private trieCurrent;
  145. private trieResult;
  146. private entityExcess;
  147. private stateBeforeEntity;
  148. private stateInNamedEntity;
  149. private emitNamedEntity;
  150. private stateBeforeNumericEntity;
  151. private decodeNumericEntity;
  152. private stateInNumericEntity;
  153. private stateInHexEntity;
  154. private allowLegacyEntity;
  155. /**
  156. * Remove data that has already been consumed from the buffer.
  157. */
  158. private cleanup;
  159. private shouldContinue;
  160. /**
  161. * Iterates through the buffer, calling the function corresponding to the current state.
  162. *
  163. * States that are more likely to be hit are higher up, as a performance improvement.
  164. */
  165. private parse;
  166. private finish;
  167. /** Handle any trailing data. */
  168. private handleTrailingData;
  169. private getSection;
  170. private emitPartial;
  171. }
  172. export {}; // empty export: marks this file as a module and keeps declarations written without `export` (such as `State`) from being exposed to importers
  173. //# sourceMappingURL=Tokenizer.d.ts.map